Started: 2020-09-02
Last edited: 2021-01-24 13:25:29

### packages ==================================================================
library(tidyverse)

# single cell
library(Seurat)

# rmd
library(kableExtra)

# plotting
library(patchwork)
library(ggthemes)

# go enrichment
library(clusterProfiler)
library(org.Hs.eg.db)

We can take a first pass at annotating the clusters from our 10x data by comparing them to the reference dataset that we put together. This was done by compiling celltype-averaged expression values from the following single cell RNA-seq studies: (Pavličev et al. 2017; Vento-Tormo et al. 2018; Suryawanshi et al. 2018).

1 Data

1.1 Samples

This is the sample information sent by Alice. Not sure about the INP id for AL09 and AL10. The library for INP188 villi (associated with AL07) was problematic, so is not used.

sample.info <- read_csv("../ext/from-alice/sample_info.csv")

── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
  AL_id = col_character(),
  inp_id = col_character(),
  sample_id = col_character(),
  covid = col_character(),
  tissue = col_character()
)
sample.info %>% kable(caption = "Sample information") %>% kable_styling(full_width = FALSE)
Sample information
AL_id inp_id sample_id covid tissue
AL01 INP88 COVID_4 covid decidua
AL02 INP88 COVID_4 covid villi
AL03 INP90 COVID_5 covid decidua
AL04 INP90 COVID_5 covid villi
AL05 INP187 CNTRL_1 cntrl decidua
AL06 INP187 CNTRL_1 cntrl villi
AL07 INP188 CNTRL_2 cntrl decidua
AL08 INP188 CNTRL_2 cntrl villi
AL09 NA CNTRL_3 cntrl decidua
AL10 NA CNTRL_3 cntrl villi

1.2 Seurat-processed data

Read the Seurat-processed (by Eric) data sent by Alice.

seur <- readRDS("../ext/from-alice/placenta.rds")

head(seur@meta.data)

1.2.1 Add sample metadata

The orig.ident column in the metadata represents the sample IDs from the sample info table above. We just have rename them to left-pad the numbers for consistency, and set new cluster names as default idents.

# left pad orig.idents
seur@meta.data$orig.ident <- gsub("(AL)(\\d)$", "\\10\\2", seur@meta.data$orig.ident)

# add additional metadata (covid status, tissue)
seur@meta.data$covid <- sample.info$covid[match(x = seur@meta.data$orig.ident, 
                                                table = sample.info$AL_id)]
seur@meta.data$tissue <- sample.info$tissue[match(x = seur@meta.data$orig.ident,
                                                  table = sample.info$AL_id)]

# rename clusters for consistency
seur$seurat_clusters <- paste0("clust_", seur$seurat_clusters)
seur$seurat_clusters <- gsub("(clust_)(\\d$)", "\\10\\2", as.character(seur$seurat_clusters)) # left pad
seur$seurat_clusters <- factor(seur$seurat_clusters, levels = paste0("clust_", c("00", "01", "02", "03", "04", "05", "06", "07", "08", "09", 10:34)))

# set seurat clusters as default idents
Idents(seur) <- seur@meta.data$seurat_clusters
# add sample_id
seur@meta.data$sample_id <- sample.info$sample_id[match(x = seur@meta.data$orig.ident, 
                                                        table = sample.info$AL_id)]

1.3 Average by cluster

We have to get transcriptomes averaged by clusters, so that we can compare them with the reference data in order to narrow down annotations. For averaging, we will use sctransform normalized data since this is what we use for marker gene detection down the line.

# average sctransformed expression
seur.avg <- AverageExpression(seur)

# averaged data to use for correlations (SCT transformed)
plac <- seur.avg$SCT

1.4 Reference data

# read reference datasets
ref <- read.csv("../results/01_reference-atlas/vento-surya-pavli_joined.csv", stringsAsFactors = FALSE)

1.5 Get data in shape

We will join the data with the reference data so they are both in the same dataframe.

# add gene names column
plac$gene_name <- rownames(plac)

# inner join
plac <- dplyr::inner_join(plac, ref, by = "gene_name")

head(plac)

2 Correlations within the data

Before moving to annotating clusters by comparison with the reference dataset, first we will look at the correlation structure within the dataset.

2.1 Correlation matrix

# build correlation matrix from expression data
cor.matrix <- cor(plac %>% dplyr::select(starts_with("clust")), method = 'spearman')

# reorder correlation matrix based on clustering
dd <- as.dist((1 - cor.matrix)) 
hc <- hclust(dd, method = 'complete')
cormat <- cor.matrix[hc$order, hc$order]

# melt correlation matrix
dat <- reshape2::melt(cormat, na.rm = T)

## plot
p <- ggplot(data = dat, aes(Var1, Var2, fill = value)) +
  geom_tile(colour = "white") +
  scale_fill_gradient(low = 'white', high = 'red',
                      name = "Spearman\nCorrelation") +
  coord_fixed(ratio = 1) +
  labs(title = "Correlations among clusters") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, size = 8, 
                                   hjust=1, vjust = 0.5),
        axis.text.y = element_text(size=8),
        axis.ticks.length = unit(0.15, units = c('lines')),
        legend.title = element_text(size = 10),
        axis.title = element_blank(),
        panel.grid = element_blank(),
        panel.border = element_blank(),
        legend.key.size = unit(0.8, units = c('lines')))

ggsave(p, filename = "../results/02_annotation/plots/corrplot_clusters.pdf", 
       device = "pdf", width = 7, height = 6, units = "in")

print(p)

2.2 Heirarchical clustering

# build correlation matrix from expression data
cor.matrix <- cor(plac %>% dplyr::select(starts_with("clust")), method = 'spearman')

# reorder correlation matrix based on clustering
dd <- as.dist((1 - cor.matrix)) 
hc <- hclust(dd, method = 'complete')

pdf(file = "../results/02_annotation/plots/hclust_clusters.pdf", width = 7, height = 5)
plot(hc, cex = 0.8)
dev.off()
null device 
          1 
plot(hc, cex = 0.8)

There are three big cluster blocks, with substructure within them. The cluster blocks also correspond well with the UMAP plot that was sent by Alice.

3 Annotation by correlation

Let’s now actually measure correlations between all clusters with the reference datasets. We will treat our data as query dataset and for each cluster assign top 3 cell types in each reference dataset.

refdata <- c("vento", "surya", "pavli")

# build correlation matrix from expression data
cor.matrix <- cor(plac[, names(plac)[names(plac) != "gene_name"]], 
                  method = 'spearman')

# reorder correlation matrix based on clustering
dd <- as.dist((1 - cor.matrix))
hc <- hclust(dd, method = 'complete')
cor.matrix <- cor.matrix[hc$order, hc$order]

# melt correlation matrix
cormat <- reshape2::melt(cor.matrix, na.rm = T)
cormat$Var1_source <- sapply(strsplit(as.character(cormat$Var1), split = "_"), "[[", 1)
cormat$Var2_source <- sapply(strsplit(as.character(cormat$Var2), split = "_"), "[[", 1)

# subset
cormat <- cormat[which(cormat$Var1_source %in% refdata & 
                   cormat$Var2_source == "clust"), ]

# top 3 matches with highest correlation coefficients
topmatch <- data.frame( # empty df to fill top matches from the loop below
  cluster = unique(as.character(cormat$Var2)) %>% sort(),
  vento.top1 = NA,
  surya.top1 = NA,
  pavli.top1 = NA,
  vento.top2 = NA,
  surya.top2 = NA,
  pavli.top2 = NA,
  vento.top3 = NA,
  surya.top3 = NA,
  pavli.top3 = NA
)

cormat$top3 <- NA

for(i in unique(cormat$Var2)){
  for(j in refdata){
    # identify top3 match indices
    ind <- which(cormat$Var2 == i & cormat$Var1_source == j)
    val <- cormat$value[ind]
    top3ind <- ind[order(val, decreasing = TRUE)[1:3]]
    
    # assign match ranking to correlation data for plotting
    cormat$top3[top3ind[1]] <- "1"
    cormat$top3[top3ind[2]] <- "2"
    cormat$top3[top3ind[3]] <- "3"
    
    # assign top matches to topmatches dataframe
    topmatch[topmatch$cluster == i, paste0(j, ".top1")] <- gsub(paste0(j, "_"), "", as.character(cormat$Var1[top3ind[1]]))
    topmatch[topmatch$cluster == i, paste0(j, ".top2")] <- gsub(paste0(j, "_"), "", as.character(cormat$Var1[top3ind[2]]))
    topmatch[topmatch$cluster == i, paste0(j, ".top3")] <- gsub(paste0(j, "_"), "", as.character(cormat$Var1[top3ind[3]]))
  }
}

3.1 Plots

# plotting function
clustAnnoPlot <- function(dat, query, reference) {
  
  dat <- dat[which(dat$Var2_source == query & dat$Var1_source == reference), ]
  
  p <- ggplot(data = dat, 
              aes(Var1, Var2, fill = value)) +
    geom_tile(colour = "white") +
    scale_fill_gradient(low = 'white', high = 'red',
                        name = "Spearman\nCorrelation") +
    geom_point(aes(Var1, Var2, alpha = top3),
               size = 1.5, shape = 19, stroke  = 0) +
    scale_alpha_manual(values = c(1, 0.5, 0.25), 
                       breaks = c(1, 2, 3),
                       name = "top3", na.value = 0) +
    coord_fixed(ratio = 1) +
    xlab("reference") +
    ylab("query") +
    labs(caption = "For each celltype in query, black points represent top 3 celltypes from reference with highest correlation.",
         title = paste0(query, " vs. ", reference)) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, size = 8, 
                                     hjust=1, vjust = 0.5),
          axis.text.y = element_text(size=8),
          axis.ticks.length = unit(0.15, units = c('lines')),
          legend.title = element_text(size = 10),
          axis.title = element_text(size = 8),
          plot.caption = element_text(size = 7),
          panel.grid = element_blank(),
          panel.border = element_blank(),
          legend.key.size = unit(0.8, units = c('lines')))
  
  return(p)
}

3.1.1 Against Vento-Tormo et al

## Annotation plots
anno.vento <- clustAnnoPlot(dat = cormat, query = "clust", reference = "vento")
cowplot::ggsave2(anno.vento, device = "pdf", width = 7.5, height = 6.5, units = "in",
                 filename = "../results/02_annotation/plots/annotation-by-correlation_ref-vento.pdf")
print(anno.vento)

3.1.2 Against Suryavanshi et al

## Annotation plots
anno.surya <- clustAnnoPlot(dat = cormat, query = "clust", reference = "surya")
cowplot::ggsave2(anno.surya, device = "pdf", width = 7.5, height = 6.5, units = "in",
                 filename = "../results/02_annotation/plots/annotation-by-correlation_ref-surya.pdf")
print(anno.surya)

3.1.3 Against Pavlicev et al

## Annotation plots
anno.pavli <- clustAnnoPlot(dat = cormat, query = "clust", reference = "pavli")
cowplot::ggsave2(anno.pavli, device = "pdf", width = 7, height = 6, units = "in",
                 filename = "../results/02_annotation/plots/annotation-by-correlation_ref-pavli.pdf")
print(anno.pavli)

3.2 Top matches

topmatch %>% kable(caption = "Top 3 matches against all reference datasets.") %>% kable_styling(full_width = FALSE, font_size = 10)
Top 3 matches against all reference datasets.
cluster vento.top1 surya.top1 pavli.top1 vento.top2 surya.top2 pavli.top2 vento.top3 surya.top3 pavli.top3
clust_00 dS3 dec.DSC DEC dS2 dec.FB1 ESF dP2 dec.FB2 SYN
clust_01 dM2 dec.MAC SYN dM1 vil.HC DEND HB dec.DC2 DEC
clust_02 dP2 dec.FB1 DEC dS2 dec.FB2 SYN dS3 dec.DSC ESF
clust_03 dM1 dec.MAC SYN MO vil.HC DEND dM2 dec.NK1 DEC
clust_04 Endo.m dec.VEC DEC Endo.L dec.LEC ESF Endo.f vil.VEC SYN
clust_05 dNK3 dec.NK1 DEC dNK2 dec.TC ESF Tcells dec.MAC SYN
clust_06 fFB1 dec.FB1 DEC dP2 vil.FB3 SYN dS2 dec.FB2 ESF
clust_07 fFB1 vil.FB3 DEC dP2 dec.FB1 ESF dS2 dec.FB2 SYN
clust_08 dM1 dec.MAC DEND MO vil.HC SYN dM2 dec.DC2 DEC
clust_09 Tcells dec.TC DEC dNK3 dec.NK1 ESF dNK2 dec.NK2 SYN
clust_10 dNK3 dec.NK1 DEC dNK2 dec.TC ESF dNK1 dec.NK2 SYN
clust_11 dM1 dec.MAC DEND dM2 vil.HC DEC dM3 dec.DC2 SYN
clust_12 dM1 dec.MAC SYN dM2 vil.HC DEND dM3 dec.DC2 DEC
clust_13 EVT vil.EVT EVT SCT dec.DSC DEC dP2 dec.FB1 ESF
clust_14 dNK3 dec.NK1 DEC dNK2 dec.TC ESF dNK1 dec.NK2 SYN
clust_15 dP2 dec.MAC DEC dM1 dec.FB2 SYN fFB1 dec.FB1 ESF
clust_16 VCT vil.VCT CYT2 SCT vil.SCT CYT3 EVT vil.EVT CYT1
clust_17 VCT vil.VCT CYT2 SCT vil.SCT CYT3 EVT vil.EVT CYT1
clust_18 dS3 dec.DSC DEC dS2 dec.FB1 ESF dS1 dec.FB2 CYT2
clust_19 dP1 dec.SMC DEC dP2 vil.FB2 ESF Endo.m dec.FB2 SYN
clust_20 Plasma dec.MAC SYN dM1 dec.NK1 DEC dNK3 dec.TC DEND
clust_21 Endo.m dec.VEC DEC Endo.f vil.VEC SYN Endo.L dec.LEC ESF
clust_22 dM2 dec.MAC SYN dM1 vil.HC DEC HB dec.DC2 DEND
clust_23 SCT vil.VCT CYT2 VCT vil.SCT CYT3 EVT vil.EVT CYT1
clust_24 SCT vil.SCT CYT2 VCT vil.VCT CYT3 EVT vil.EVT SYN
clust_25 VCT vil.VCT CYT2 SCT vil.SCT CYT3 EVT vil.EVT CYT1
clust_26 dM1 dec.MAC DEC dM2 vil.HC SYN dM3 dec.VEC ESF
clust_27 dM2 dec.MAC SYN fFB1 vil.HC DEC HB vil.FB3 ESF
clust_28 MO dec.MAC SYN dM1 vil.HC CYT2 dM2 dec.NK2 DEND
clust_29 SCT vil.SCT CYT2 VCT vil.VCT CYT1 EVT vil.EVT CYT3
clust_30 dNK.p dec.NK2 ESF dNK1 dec.NK1 DEND dNK3 dec.MAC CYT2
clust_31 dM2 dec.MAC SYN dM1 vil.HC DEND MO dec.NK1 CYT2
clust_32 Tcells dec.TC SYN dNK3 dec.NK1 DEC dM1 dec.MAC DEND
clust_33 fFB1 vil.FB3 SYN dP2 dec.SMC DEC dP1 dec.FB1 ESF
clust_34 dM2 dec.MAC SYN dM1 vil.HC DEND HB dec.DC2 DEC
NA

3.3 Labels for clusters

We need to create consistent labels for all cell types that we identify. Below is a list I created in which the top level objects are labels we will give to the cell types, within which contained are lists of corresponding labels from vento and surya. These are tentative labels. After the first pass, when we go through the clusters with a fine-toothed comb, we can modify them further. For example, if we have multiple clusters labeled as dec.DSC (decidual macrophages), and if those clusters are distinct, we can then break up this label into dec.DSC1 and dec.DSC2.

labs <- list("dec.DSC"    = list("vento.lab" = c("dS1", "dS2", "dS3"),
                                 "surya.lab" = c("dec.DSC", "dec.FB1", "dec.FB2")),
             "DC"     = list("vento.lab" = c("DC1", "DC2"),
                                 "surya.lab" = c("dec.DC1", "dec.DC2")),
             "Mac"    = list("vento.lab" = c("dM1", "dM2", "dM3"),
                                 "surya.lab" = c("dec.MAC")),
             "NK"     = list("vento.lab" = c("dNK.p", "dNK1", "dNK2", "dNK3", "NK.CD16neg", "NK.CD16pos"),
                                 "surya.lab" = c("dec.NK1", "dec.NK2")),
             "dec.SMC"    = list("vento.lab" = c("dP1", "dP2"),
                                 "surya.lab" = c("dec.SMC")),
             "dec.Endo"   = list("vento.lab" = c("Endo.m"),
                                 "surya.lab" = c("dec.VEC")),
             "dec.Epi"    = list("vento.lab" = c("Epi1", "Epi2"),
                                 "surya.lab" = c("dec.EEC")),
             "Tcell"  = list("vento.lab" = c("Tcells"),
                                 "surya.lab" = c("dec.TC")),
             "vil.Endo"   = list("vento.lab" = c("Endo.f"),
                                 "surya.lab" = c("vil.VEC")),
             "vil.EVT"    = list("vento.lab" = c("EVT"),
                                 "surya.lab" = c("vil.EVT")),
             "vil.FB"     = list("vento.lab" = c("fFB1", "fFB2"),
                                 "surya.lab" = c("vil.FB1", "vil.FB2", "vil.FB3")),
             "vil.Hofb"   = list("vento.lab" = c("HB"),
                                 "surya.lab" = c("vil.HC")),
             "vil.SCT"    = list("vento.lab" = c("SCT"),
                                 "surya.lab" = c("vil.SCT")),
             "vil.VCT"    = list("vento.lab" = c("VCT"),
                                 "surya.lab" = c("vil.VCT")),
             "Gran"   = list("vento.lab" = c("Granulocytes"),
                                 "surya.lab" = c()),
             "ILC"    = list("vento.lab" = c("ILC3"),
                                 "surya.lab" = c()),
             "Mono"   = list("vento.lab" = c("MO"),
                                 "surya.lab" = c()),
             "Plasma" = list("vento.lab" = c("Plasma"),
                                 "surya.lab" = c()),
             "EB"     = list("vento.lab" = c(),
                                 "surya.lab" = c("vil.EB")),
             "unk.Endo.L" = list("vento.lab" = c("Endo.L"),
                                 "surya.lab" = c("dec.LEC"))
)

3.4 Matches consistent between reference datasets

The following clusters have consistent top1 match between vento and surya references, i.e. they correspond to the same cell type category. The pavli reference is too unresolved to be used diagnostically, so we will ignore it for now. For some cell types it is confirmatory, though, e.g. cluster_00 corresponds to DSC in all three references.

# find out which clusters are consistent.
# If vento.top1 and surya.top1 are found in the same item in the labs list, the mapping is consistent.
consistent <- c(NA)
for(i in 1:nrow(topmatch)){
  consistent[i] <- grep(topmatch$vento.top1[i], labs) == grep(topmatch$surya.top1[i], labs)
}

# subset to include consistent set
topmatch.cons <- topmatch %>% 
  filter(consistent) %>% 
  dplyr::select(cluster, vento.top1, surya.top1)

# add our tentative labels to the clusters
for(i in 1:nrow(topmatch.cons)){
  topmatch.cons$label[i] <- names(labs)[grep(topmatch.cons$vento.top1[i], labs)]
}

# print
topmatch.cons[order(topmatch.cons$label), ] %>% knitr::kable() %>% kable_styling(full_width = FALSE)
cluster vento.top1 surya.top1 label
1 clust_00 dS3 dec.DSC dec.DSC
16 clust_18 dS3 dec.DSC dec.DSC
4 clust_04 Endo.m dec.VEC dec.Endo
18 clust_21 Endo.m dec.VEC dec.Endo
17 clust_19 dP1 dec.SMC dec.SMC
2 clust_01 dM2 dec.MAC Mac
3 clust_03 dM1 dec.MAC Mac
7 clust_08 dM1 dec.MAC Mac
10 clust_11 dM1 dec.MAC Mac
11 clust_12 dM1 dec.MAC Mac
19 clust_22 dM2 dec.MAC Mac
22 clust_26 dM1 dec.MAC Mac
23 clust_27 dM2 dec.MAC Mac
26 clust_31 dM2 dec.MAC Mac
29 clust_34 dM2 dec.MAC Mac
5 clust_05 dNK3 dec.NK1 NK
9 clust_10 dNK3 dec.NK1 NK
13 clust_14 dNK3 dec.NK1 NK
25 clust_30 dNK.p dec.NK2 NK
8 clust_09 Tcells dec.TC Tcell
27 clust_32 Tcells dec.TC Tcell
12 clust_13 EVT vil.EVT vil.EVT
6 clust_07 fFB1 vil.FB3 vil.FB
28 clust_33 fFB1 vil.FB3 vil.FB
20 clust_24 SCT vil.SCT vil.SCT
24 clust_29 SCT vil.SCT vil.SCT
14 clust_16 VCT vil.VCT vil.VCT
15 clust_17 VCT vil.VCT vil.VCT
21 clust_25 VCT vil.VCT vil.VCT

3.5 Ambiguous clusters

The top1 matches for some clusters with vento and surya are not the same. The are the clusters that will require a more detailed look to determine their identify.

# subset for ambiguous clusters
topmatch.ambig <- topmatch %>% 
  filter(!consistent) %>% 
  dplyr::select(cluster, vento.top1, surya.top1)

# add our labels.
for(i in 1:nrow(topmatch.ambig)){
  topmatch.ambig$label[i] <- paste0(
    names(labs)[grep(topmatch.ambig$vento.top1[i], labs)],
    " or ",
    names(labs)[grep(topmatch.ambig$surya.top1[i], labs)]
    )
}

# print
topmatch.ambig[order(topmatch.ambig$label), ] %>% kable() %>% kable_styling(full_width = FALSE)
cluster vento.top1 surya.top1 label
1 clust_02 dP2 dec.FB1 dec.SMC or dec.DSC
3 clust_15 dP2 dec.MAC dec.SMC or Mac
6 clust_28 MO dec.MAC Mono or Mac
4 clust_20 Plasma dec.MAC Plasma or Mac
2 clust_06 fFB1 dec.FB1 vil.FB or dec.DSC
5 clust_23 SCT vil.VCT vil.SCT or vil.VCT

4 Annotation refinement

The annotations we have so far are only a starting point. Now we can look at each annotation more carefully to make sure that everything makes sense.

4.1 Sample contribution to clusters

One of the ways in which we can refine the clusters further is by knowing which tissue that sample arises from. For example, if a cluster has macrophage gene signature and if most cells in that cluster come from villi samples, we can further narrow down the identity of the cluster to Hofbauer cells. We can do this by simply counting for each cluster how many cells come from decidua vs villi and by calculating the percentage.

# count cells in each cluster by tissue of origin
bytissue <- table(seur$tissue, seur$seurat_clusters) %>% 
  as.data.frame() %>% 
  dplyr::rename(tissue = Var1, cluster = Var2, frequency = Freq)

# calculate fraction
for(i in 1:nrow(bytissue)){
  bytissue$fraction[i] <- bytissue$frequency[i]/
    sum(bytissue$frequency[bytissue$cluster == bytissue$cluster[i]])
}

# plot
p.cells <- ggplot(data = bytissue, aes(x = frequency, y = cluster)) +
  geom_bar(aes(fill = tissue), stat = "identity", position = "stack") +
  ggtitle("Number of cells")

p.frac <- ggplot(data = bytissue, aes(x = fraction, y = cluster)) +
  geom_bar(aes(fill = tissue), stat = "identity", position = "stack") +
  ggtitle("Fraction of cells")

p.cells + p.frac + plot_layout(guides = "collect", nrow = 2) +
  plot_annotation(caption = "Sample contribution to clusters") &
  coord_flip() &
  scale_fill_tableau(palette = "Classic 10 Medium") &
  theme_minimal() +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major = element_line(size = 0.25),
        plot.title = element_text(size = 10),
        axis.text.x = element_text(angle = 90, vjust = 0.5))

NA
NA

Following clusters have more than 70% cells coming from the same tissue of origin.

bytissue[, c("cluster", "tissue", "fraction")] %>% 
  pivot_wider(names_from = "tissue", values_from = "fraction") %>% 
  filter(decidua > 0.70 | villi > 0.70) %>% 
  kable(digits = 2, row.names = FALSE, 
        caption = "Clusters with more than 70% cells from one tissue") %>%
  kable_styling(full_width = FALSE)
Clusters with more than 70% cells from one tissue
cluster decidua villi
clust_00 0.98 0.02
clust_01 0.13 0.87
clust_03 0.15 0.85
clust_04 0.80 0.20
clust_05 0.93 0.07
clust_06 0.19 0.81
clust_07 0.21 0.79
clust_08 0.15 0.85
clust_12 0.77 0.23
clust_13 0.80 0.20
clust_15 0.78 0.22
clust_16 0.08 0.92
clust_17 0.01 0.99
clust_18 0.92 0.08
clust_20 0.15 0.85
clust_22 0.05 0.95
clust_23 0.13 0.87
clust_24 0.02 0.98
clust_25 0.01 0.99
clust_26 0.20 0.80
clust_27 0.02 0.98
clust_28 0.03 0.97
clust_29 0.05 0.95
clust_31 0.01 0.99
clust_33 0.04 0.96
clust_34 0.03 0.97

Following clusters have ambiguous origin, i.e. they contain cells from decidua and villi samples.

bytissue[, c("cluster", "tissue", "fraction")] %>% 
  pivot_wider(names_from = "tissue", values_from = "fraction") %>% 
  filter(decidua < 0.70 & villi < 0.70) %>% 
  kable(digits = 2, row.names = FALSE, 
        caption = "Clusters with cells from both tissues") %>%
  kable_styling(full_width = FALSE)
Clusters with cells from both tissues
cluster decidua villi
clust_02 0.65 0.35
clust_09 0.44 0.56
clust_10 0.43 0.57
clust_11 0.35 0.65
clust_14 0.59 0.41
clust_19 0.54 0.46
clust_21 0.64 0.36
clust_30 0.67 0.33

This is largely consistent with the putative annotations. All putatively trophoblast clusters come overwhelmingly from villi samples. The only exception is extravillous trophoblast cluster, which as expected, comes from decidua samples. At least in the species that I have worked with, it is often difficult to separate the fetal tissue from maternal tissue. Therefore, we use this analysis only as a rough guide, i.e. when there is a conflict between the two, the marker gene-based annotation should take precedence over tissue of origin in cell type id because the latter is experimentally (and as a consequence, analytically) messy to disentangle.

Most clusters that have mixed origin, are putative immune cell type clusters. This could suggest that the immune cell types are present on both maternal and fetal sides of the interface, and because they are the same cell type, they end up in the same cluster. This means that we can’t confidently say for immune cells whether they are decidual or villous. For this reason, when such ambiguity is present, we will label them without the tissue prefix, i.e. for example Tcell instead of dec.Tcell or vil.Tcell.

4.2 Marker genes

4.2.1 Identify marker genes

I ran Seurat::findAllMarkers on this data using the SCT assay. This function outputs a list of marker genes for each cluster compared to all other cells in the data. This takes a while (hours) to run, so I ran it on the cluster and saved the results to a .csv. See the code directory for full code, and see below for the function call.

# set default assay to SCT
DefaultAssay(object = plac) <- "SCT"

# find markers for all clusters
markers <- FindAllMarkers(object = plac, 
                          only.pos = TRUE,
                          logfc.threshold = 0.25)  

# write.csv
write.csv(markers, "/home/arc78/scratch60/covid-placenta/markers_sct.csv", row.names = FALSE)
markers <- read.csv("../results/02_annotation/files/markers_sct.csv")

# rename clusters
markers$cluster <- paste0("clust_", markers$cluster)
markers$cluster <- gsub("(clust_)(\\d)$", "\\10\\2", markers$cluster)
markers$cluster <- factor(markers$cluster, levels = unique(markers$cluster))

# split by cluster
markers <- split.data.frame(markers, markers$cluster)

4.2.2 Plot marker genes

We will plot top 50 marker genes for all clusters and save the plot to pdf.

# plotting function
DotPlot2 <- function(object = seur, assay = "SCT", features, title, ...) {
  p <- DotPlot(object, 
               assay = assay,
               features = features, 
               dot.min = 0.05, dot.scale = 4) +
    coord_flip() +
    labs(caption = paste0(assay, " normalized expression"), title = title) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
          panel.grid = element_blank())
  
  return(p)
}
top50marker.plots <- list()
for(i in names(markers)) {
  if(nrow(markers[[i]]) > 50)
    features <- markers[[i]]$gene[1:50]
  if(nrow(markers[[i]]) < 50)
    features <- markers[[i]]$gene

  top50marker.plots[[i]] <- DotPlot2(features = features, title = i)
         
  cowplot::ggsave2(top50marker.plots[[i]], 
                   filename = paste0("../results/02_annotation/plots/top50markers_dotplot_", i, ".pdf"),
                   width = 7.5, height = 7.5, units = "in")
}

5 Manual annotation

With the marker gene plots we have made, we can examine the top marker genes in each cluster to manually confirm or adjust the annotation of each cluster.

Let’s make a dataframe in which to keep track of the assignments.

annotation <- data.frame(
  cluster = unique(seur$seurat_clusters) %>% sort(),
  annotation = NA,
  notes = NA
)

For each set of marker genes, we can also perform a number of manual checks to assess cell type annotation.

First is GO enrichment using clusterProfiler package, for which below is a function for quickly looking at top 10 enriched GO categories.

# function for GO enrichment
# 1. for a given cluster, prints n.print.rows top enriched GO terms
# 2. Can exclude ribosomal genes (default) because they dominate some GO enrichments
enrichGO2 <- function(clust, ngenes = 100, include.ribo = FALSE, n.print.rows = 10, ...) {
  
  ids.bitr <- bitr(markers[[clust]]$gene[1:ngenes], 
                fromType = "SYMBOL", toType = c("ENTREZID", "SYMBOL"), 
                OrgDb = "org.Hs.eg.db")
    
  if(include.ribo == FALSE)
    features <- ids.bitr$ENTREZID[!(grepl("RP[SL]", ids.bitr$SYMBOL))]
  if(include.ribo == TRUE)
    features <- ids.bitr$ENTREZID

  ego <- enrichGO(
    gene          = features,    
    OrgDb         = org.Hs.eg.db,
    ont           = "BP",
    pAdjustMethod = "BH",
    pvalueCutoff  = 0.01, # default 0.01
    qvalueCutoff  = 0.05, # default 0.05
    readable = TRUE)
  
  dftoprint <- clusterProfiler::simplify(ego)@result[1:n.print.rows, c(2,8)]
  rownames(dftoprint) <- NULL
  
  return(dftoprint)
}

We can also check if the cluster is enriched is genes that are labeled as markers genes for a cell type by other studies. (Vento-Tormo et al. 2018) have provided top 30 marker genes for all cell types identified by them.

# read top 30 markers from vento-tormo et al
markers.vento <- readxl::read_excel("../ext/from-papers/vento-tormo_2018/41586_2018_698_MOESM1_ESM/Supplementary Table 2.xlsx")

# rename columns
markers.vento <- dplyr::rename(markers.vento,
                               VCT_1 = VCT...2,
                               Tcells_1 = `T cells...5`,
                               Tcells_2 = `T cells...9`,
                               VCT_2 = `VCT...10`,
                               Tcells_3 = `T cells...11`,
                               FB_1 = F1,
                               EVT_1 = `EVT...14`,
                               Tcells_4 = `T cells...15`,
                               EVT_2 = `EVT...16`,
                               NKCD16pos = `Blood NK CD16+`,
                               MO_1 = `MO...22`,
                               MO_2 = `MO...27`,
                               NKCD16neg = `Blood NK CD16-`,
                               FB2 = F2,
                               Endo.m = `Endo (m)`,
                               Endo.L = `Endo L`,
                               Endo.f = `Endo (f)` 
)

# remove ensembl ids
markers.vento <- apply(X = markers.vento, MARGIN = 2, FUN = function(x) gsub("_ENSG\\d+", "", x)) %>% 
  as.data.frame()

(Suryawanshi et al. 2018) have also provided a list of top30 marker genes in the supplementary data.

markers.surya.vil <- readxl::read_excel("../ext/from-papers/suryawanshi_2018/supp-data/aau4788_Data_file_S3.xlsx", sheet = "Villi", range = "A1:G271", trim_ws = TRUE)
markers.surya.dec <- readxl::read_excel("../ext/from-papers/suryawanshi_2018/supp-data/aau4788_Data_file_S3.xlsx", sheet = "Deciuda", range = "A1:G331", trim_ws = TRUE)

markers.surya.vil <- dplyr::rename(markers.surya.vil, cluster = `cell type`)

markers.surya.vil$cluster <- paste0("vil.", markers.surya.vil$cluster)
markers.surya.dec$`cluster` <- paste0("dec.", markers.surya.dec$`cluster`)

markers.surya <- rbind(markers.surya.vil, markers.surya.dec)

5.1 DSC

Clusters 0 and 18.

5.1.1 Notes

  1. Clusters 0 and 18 highly correlated with each other and most similar to each other than to other clusters.
  2. Both are consistently similar to the decidual stromal cells in both vento and surya datasets.
  3. The marker genes include Prolactin, IGFBP1 etc, which are typical of decidual stromal cells.
  4. These two clusters also arise unabiguously from the decidua samples rather than villi samples.
annotation$annotation[annotation$cluster %in% c("clust_00")] <- "dec.DSC_1"
annotation$annotation[annotation$cluster %in% c("clust_18")] <- "dec.DSC_2"

annotation[annotation$cluster %in% paste0("clust_", c("00", "18")), ]

5.1.2 Marker genes plots

top50marker.plots[["clust_00"]]

top50marker.plots[["clust_18"]]

5.1.3 Go anrichment

enrichGO2("clust_00")
'select()' returned 1:1 mapping between keys and columns

5.1.4 Vento markers

DotPlot2(features = markers.vento$dS3, title = "vento: dS3")

5.1.5 Surya markers

DotPlot2(features = markers.surya$gene[markers.surya$cluster == "dec.DSC"], title = "surya markers: DSC")

5.2 Endothelial cells

Clusters 4 and 21.

5.2.1 Notes

  1. Clusters 4 and 21 are highly correlated.
  2. They consistently get assigned as decidual endothelial cells by both vento and surya datasets.
  3. More than 75% of the cells in both of these clusters come from the decidua samples.
  4. Markers genes of both clusters are enriched in GO categories related to epithelium development, endothelium development etc.
  5. But these two clusters are not as alike as clusters 0 and 18. They are different in terms of their expression of many endothelium related genes like CD34, PROX1, VWF, SPARCL. Cluster 21 looks more like “typical” endothelial cells. Cluster 4 has low expression of many typical endothelial genes.
  6. Cluster 4 has higher expression of some lymphatic endothelial cells: PROX1, LYVE1. Many genes from the list of surya marker genes for dec.LEC are expressed in cluster 4. This suggests that Cluster 4 may be lymphatic endothelial cells and cluster 21 may be endothelial cells.
annotation$annotation[annotation$cluster %in% c("clust_04")] <- "dec.Endo.L"
annotation$annotation[annotation$cluster %in% c("clust_21")] <- "dec.Endo"

annotation[annotation$cluster %in% paste0("clust_", c("04", "21")), ]

5.2.2 Marker gene plots

top50marker.plots[["clust_04"]]

top50marker.plots[["clust_21"]]

5.2.3 GO enrichment

enrichGO2("clust_04")
'select()' returned 1:1 mapping between keys and columns
2% of input gene IDs are fail to map...
enrichGO2("clust_21")
'select()' returned 1:1 mapping between keys and columns
2% of input gene IDs are fail to map...

5.2.4 Vento markers

DotPlot2(features = markers.vento$Endo.m, title = "vento: Endo.m")

DotPlot2(features = markers.vento$Endo.f, title = "vento: Endo.f")

DotPlot2(features = markers.vento$Endo.L, title = "vento: Endo.L")

5.2.5 Surya markers

DotPlot2(features = markers.surya$gene[markers.surya$cluster == "vil.VEC"], title = "surya markers: vil.VEC")

DotPlot2(features = markers.surya$gene[markers.surya$cluster == "dec.VEC"], title = "surya markers: dec.VEC")

DotPlot2(features = markers.surya$gene[markers.surya$cluster == "dec.LEC"], title = "surya markers: dec.LEC")

5.3 EVT

Cluster 13.

5.3.1 Notes

  1. This cluster is unique in its expression of HLA-G, typical of extravillous trophoblast.
  2. Several other genes often enriched in EVT are also enriched in this cluster: FN1, PAPPA2, DIO2, etc.
  3. top 30 markers associated with EVT_2 from vento dataset are also highly and specifically expressed in this cluster.
  4. It is consistently assigned to be EVT based on vento, surya, and pavli datasets.

Annotation:
Cluster 13: Extravillous trophoblast (vil.EVT)

annotation$annotation[annotation$cluster %in% c("clust_13")] <- "vil.EVT"

annotation[annotation$cluster %in% c("clust_13"), ]

5.3.2 Marker genes plot

top50marker.plots[["clust_13"]]

5.3.3 GO enrichment

enrichGO2("clust_13")
1% of input gene IDs are fail to map...

5.3.4 Vento markers

DotPlot2(features = markers.vento$EVT_1, title = "vento markers: EVT_1")

DotPlot2(features = markers.vento$EVT_2, title = "vento markers: EVT_2")

5.3.5 Surya markers

DotPlot2(features = markers.surya$gene[markers.surya$cluster == "vil.EVT"], title = "surya markers: vil.EVT")

5.4 VCT and SCT

clusters 16, 17, 23, 24, 25, and 29.

5.4.1 Notes

For more information on expression patterns and markers of different trophoblast types, see (Liu et al. 2018).

  1. In addition to cluster 13 (EVT), there are 6 other clusters that are likely trophoblast cell types (they express KRT7 and are related clusters). These 6 clusters are organized into two correlated blocks (see “correlations section above”): (16, 17, 25) and (23, 29, 24) of which the former are likely VCT and the latter SCT according to correlation-based annotation.
  2. Cluster 29 appears to be SCT given its expression of CGA, GDF15, ERVFRD-1 (Syncytin 2) genes. For some reason GO enrichment didn’t work for markers of cluster 29, but it does consistently express SCT markers from vento and surya.
  3. Cluster 24 doesn’t express ERVFRD-1 much, but it does express most other telltale genes of SCT: CSH1, CSH2, CGA, CSHL1, many PSG genes, GH2, PGF, GDF15, etc. And it is not enriched for expression of VCT markers from vento and surya. It’s enriched GO terms have to do with hormone production and signaling as expected for SCT. Thus, this is most likely an SCT cluster.
  4. Clusters 16 and 17 don’t express many markers of SCT, but do express many vento and surya markers of VCT: PAGE4, PEG10, XAGE3, etc.
  5. Cluster 25 has VCT markers as well as a strong signature of cell division genes. This shows up in the GO enrichment as well, where most enriched GO terms are “nuclear division”, “chromosome segregation”, “cell cycle G1/S phase transition” etc. This is likely a cluster of proliferating VCT cells. Incidentally both (Vento-Tormo et al. 2018) and (Liu et al. 2018) have a separate population of VCT with a cell cycle gene signature (VCT_2 in vento).
  6. Cluster 23 is most similar to SCT of vento but VCT of surya. Indeed this cluster has expression of marker genes for both VCT and SCT (see plots). It expresses PAGE4, PEG10 etc, but also expresses syncytin gene ERVW-1. This could represent a group of cytotrophoblast cells that are in the process of differentiating to sycnytial trophoblast. We will label it as a VCT type.
annotation$annotation[annotation$cluster %in% c("clust_16")] <- "vil.VCT_1"
annotation$annotation[annotation$cluster %in% c("clust_17")] <- "vil.VCT_2"
annotation$annotation[annotation$cluster %in% c("clust_23")] <- "vil.VCT_3"
annotation$annotation[annotation$cluster %in% c("clust_25")] <- "vil.VCT_4"
annotation$annotation[annotation$cluster %in% c("clust_24")] <- "vil.SCT_1"
annotation$annotation[annotation$cluster %in% c("clust_29")] <- "vil.SCT_2"

annotation$notes[annotation$cluster %in% c("clust_25")] <- "proliferating"

annotation[annotation$cluster %in% paste0("clust_", c("16", "17", "23", "25", "24", "29")), ]

5.4.2 Marker gene plots

top50marker.plots[["clust_16"]]

top50marker.plots[["clust_17"]]

top50marker.plots[["clust_25"]]

top50marker.plots[["clust_23"]]

top50marker.plots[["clust_24"]]

top50marker.plots[["clust_29"]]

5.4.3 GO enrichment

enrichGO2("clust_16")
enrichGO2("clust_17")
2.5% of input gene IDs are fail to map...
enrichGO2("clust_25")
3% of input gene IDs are fail to map...
enrichGO2("clust_23")
enrichGO2("clust_24")
1% of input gene IDs are fail to map...
enrichGO2("clust_29")
2% of input gene IDs are fail to map...

5.4.4 Vento markers

DotPlot2(features = markers.vento$VCT_1, title = "vento markers: VCT_1")

DotPlot2(features = markers.vento$VCT_2, title = "vento markers: VCT_2")

DotPlot2(features = markers.vento$SCT, title = "vento markers: SCT")

5.4.5 Surya markers

DotPlot2(features = markers.surya$gene[markers.surya$cluster == "vil.VCT"], title = "surya markers: vil.VCT")

DotPlot2(features = markers.surya$gene[markers.surya$cluster == "vil.SCT"], title = "surya markers: vil.SCT")

5.4.6 Pavli markers

DotPlot2(features = c("ADRB1", "PUF60", "SNORDA3A", "PRMT7", "E1F1AY", 
                      "FXYD3", "GRAMD2", "INSL4", "ITGB8", "PAGE4",
                      "SLC13A4", "SLC22A11"), 
         title = "pavli markers: VCT_1")

DotPlot2(features = c("XAGE3", "XAGE2", "SERINC5", "S100P", "RASA1",
                      "MFAP5", "LIN28B", "KRT8", "KRT7", "INS-IGF2", "GCM1",
                      "GATA3", "ERVW-1", "ERVFRD-1", "EGR1", "EFNA1", 
                      "CYP19A1", "PHLDA2", "ACKR2"), 
         title = "pavli markers: VCT_2")

DotPlot2(features = c("SEMA3B", "CSH2", "CSHL1", "FCGR2A", "GH2", "HBA1", 
                      "HBA2", "HBB", "HBG1", "HBG2", "HLA-DMA", "HLA-DPA1",
                      "HLA-DQB1", "HLA-DRA", "HLA-DRB1", "HPGDS", "CGB8",
                      "LGALS14", "LYVE1", "NPIPB3", "CGB5", "PSG1", "PSG3", 
                      "PSG6", "PSG9", "ZNF117", "ZNF91", "HIST2H2AB",
                      "RAMP2", "MUC20"), 
         title = "pavli markers: SCT")

5.4.7 Liu et al markers for trophoblasts

DotPlot2(seur, features = c("GCM1", "CSH1", "KRT7", "TFAP2C", "GATA3", "CDH1", "EGFR", "HLA-G", "MMP2", "RRM2", "CCNB1", "CDK1", "ERVFRD-1", "SLC1A5", "PAGE4", "CGB"), title = "Genes from Liu et al 2018")
The following requested variables were not found: CGB

5.5 Fibroblasts and SMC

Clusters 02, 06, 07, 15, 19, and 33.

5.5.1 Notes

  1. Most of these clusters represent some kind of fibroblast or a closely related cell type. Most clusters are enriched for GO terms related to ECM regulation.
  2. Clusters 02 and 15 marker genes are also highly expressed in clusters 00 and 18. Cluster 15 has more than 75% cells coming from decidua samples, and cluster 02 has over 50% cells coming from decidua samples. Decidual stromal cells are known to be actually a few different populations. Even in Vento-tormo et al, they are labeled as dS1–3, of which dS3 are the canonical DSC with PRL and IGFBP1, while dS1 and dS2 are closely related to DSC and are derived from endometrial stromal fibroblasts. These things together suggest that clusters 02 and 15 are the non-PRL varieties of DSC. To distinguish them from DSC, we can call them decidual fibroblasts.
  3. Marker genes of 06, 07, and 33 show highly correlated gene expressin, i.e. the markers on each of these clusters are also highly expressed in the other two clusters, suggesting that these three clusters a closely related cell populations. All three of these clusters are largely (over 75% of the cells) derived from villi samples, and they are fibroblast-like cells. Thus, these are likely villous fibroblasts.
  4. Cluster 19 is most similar to PV (perivascular) clusters from vento and SMC (smooth muscle cell) cluster from surya. Indeed, its markers include smooth muscle genes like MGP, MYH11, MYL9 etc, and enriched GO terms for this cluster include “muscle contraction” and “artery morphogenesis”. Thus, this cluster is most likely of perivascular smooth muscle cells of decidual origin (over 75% cells from decidua samples).
annotation$annotation[annotation$cluster %in% c("clust_02")] <- "dec.FB_1"
annotation$annotation[annotation$cluster %in% c("clust_15")] <- "dec.FB_2"
annotation$annotation[annotation$cluster %in% c("clust_06")] <- "vil.FB_1"
annotation$annotation[annotation$cluster %in% c("clust_07")] <- "vil.FB_2"
annotation$annotation[annotation$cluster %in% c("clust_33")] <- "vil.FB_3"
annotation$annotation[annotation$cluster %in% c("clust_19")] <- "dec.SMC"

annotation[annotation$cluster %in% paste0("clust_", c("02", "15", "06", "07", "33", "19")), ]

5.5.2 Marker gene plots

top50marker.plots[["clust_02"]]

top50marker.plots[["clust_15"]]

top50marker.plots[["clust_06"]]

top50marker.plots[["clust_07"]]

top50marker.plots[["clust_19"]]

top50marker.plots[["clust_33"]]

5.5.3 GO enrichment

enrichGO2("clust_02")
10.77% of input gene IDs are fail to map...
enrichGO2("clust_15")
2.56% of input gene IDs are fail to map...
enrichGO2("clust_06")
1% of input gene IDs are fail to map...
enrichGO2("clust_07")
3% of input gene IDs are fail to map...
enrichGO2("clust_19")
2% of input gene IDs are fail to map...
enrichGO2("clust_33")
3% of input gene IDs are fail to map...

5.5.4 Vento markers

DotPlot2(features = markers.vento$PV1, title = "vento markers: PV1")

DotPlot2(features = markers.vento$PV2, title = "vento markers: PV2")

5.5.5 Surya markers

DotPlot2(features = markers.surya$gene[markers.surya$cluster == "vil.FB1"], title = "surya markers: vil.FB1")

DotPlot2(features = markers.surya$gene[markers.surya$cluster == "dec.SMC"], title = "surya markers: dec.SMC")

5.6 B cells

Cluster 20.

5.6.1 Notes

This is most likely a cluster of B cells. Most markers genes are related to B cells: many immunoglobulin gnees, SPIB, MS4A1, CD79A, CD79B, BANK1, etc. It also expresses many genes that are markers for Plasma cluster from vento dataset. GO enrichment also agrees.

annotation$annotation[annotation$cluster %in% c("clust_20")] <- "Bcell"

annotation[annotation$cluster %in% paste0("clust_", c("20")), ]

5.6.2 Marker genes plot

top50marker.plots[["clust_20"]]

5.6.3 GO enrichment

enrichGO2("clust_20")
1% of input gene IDs are fail to map...

5.6.4 Vento markers

DotPlot2(features = markers.vento$Plasma, title = "vento markers: Plasma")

5.7 Other lymphocytes

Clusters 05, 09, 10, 14, 30, and 32.

5.7.1 Notes

For clues on marker genes for various types of lymphocytes see (Pizzolato et al. 2019).

  1. These are likely non-B lymphoid cells.
  2. Clusters 09 and 32 are most similar to Tcells from both vento and surya. Given the expression of CD3 genes, that appears to be the correct assignment.
    • The clusters also express TRAC and TRBC1 (constant regions of TCR alpha and beta), making them alpha-beta T cells.
    • It’s hard to tell if they are CD4 or CD8—both CD4 and CD8 genes are expressed by a small fraction of cells in both clusters, but CD8 genes expressed at a higher level.
    • GO enrichment for both include terms related to T cell differentiation, T cell activation etc.
    • These clusters contain cells from both decidua and villi samples.
  3. Cluster 05 is most similar to NK cells from vento and surya. But based on the lack of NKG7 and NCAM1 expression and expression of CD3 genes, TRAC, and TRBC1 this is likely a T cell cluster. One of its marker genes being GZMA and its overall similarity with NK cells suggest that this may be a cytotoxic T cell. Consistently, among its enriched GO terms are “T cell mediated cytotoxicity” and “Cell killing”. This cluster mostly contains cells from decidua samples.
  4. Clusters 10, 14, and 30 are NK cells based on the expression of NKG7 and NCAM1.
    • These clusters are also express GZMA, GZMB, GNLY, PRF1, KLRB1, KLRC1, KLRD1.
    • Cluster 30 is enriched for cell cycle genes, so represents proliferating cells. Vento and surya datasets also have a cluster of proliferating NK cells (dNK.p in vento and NK2 in surya). +
    • Cluster 30 in unambiguously of decidual sample origin. Clusters 10 and 14 are mixed.
annotation$annotation[annotation$cluster %in% c("clust_05")] <- "Tcell_1"
annotation$annotation[annotation$cluster %in% c("clust_09")] <- "Tcell_2"
annotation$annotation[annotation$cluster %in% c("clust_10")] <- "NK_1"
annotation$annotation[annotation$cluster %in% c("clust_14")] <- "NK_2"
annotation$annotation[annotation$cluster %in% c("clust_30")] <- "NK_3"
annotation$annotation[annotation$cluster %in% c("clust_32")] <- "Tcell_3"

annotation$notes[annotation$cluster %in% c("clust_05")] <- "cytotoxic"
annotation$notes[annotation$cluster %in% c("clust_30")] <- "proliferating"


annotation[annotation$cluster %in% paste0("clust_", c("05", "09", "10", "14", "30", "32")), ]

5.7.2 Lymphocyte candidate genes

DotPlot2(features = c("NCAM1", "NKG7", "CD3D", "CD3G", "CD3E", "TRAC", "TRBC1", "TRGC2", "TRDC", "CD8A", "CD8B", "CD4"), title = "Lymphocyte candidate genes")

5.7.3 Marker gene plots

top50marker.plots[["clust_05"]]

top50marker.plots[["clust_09"]]

top50marker.plots[["clust_10"]]

top50marker.plots[["clust_14"]]

top50marker.plots[["clust_30"]]

top50marker.plots[["clust_32"]]

5.7.4 GO enrichment

enrichGO2("clust_05")
1% of input gene IDs are fail to map...
enrichGO2("clust_09")
enrichGO2("clust_10")
1% of input gene IDs are fail to map...
enrichGO2("clust_14")
1% of input gene IDs are fail to map...
enrichGO2("clust_30")
4% of input gene IDs are fail to map...
enrichGO2("clust_32")
1% of input gene IDs are fail to map...

5.7.5 Vento markers

DotPlot2(features = markers.vento$Tcells_2, title = "vento markers: Tcells_2")

DotPlot2(features = markers.vento$Tcells_4, title = "vento markers: Tcells_4")

DotPlot2(features = markers.vento$dNK1, title = "vento markers: dNK1")

DotPlot2(features = markers.vento$dNKp, title = "vento markers: dNKp")

5.7.6 Surya markers

DotPlot2(features = markers.surya$gene[markers.surya$cluster == "dec.NK1"], title = "surya markers: dec.NK1")

DotPlot2(features = markers.surya$gene[markers.surya$cluster == "dec.NK2"], title = "surya markers: dec.NK2")

5.8 Myeloid cells

Clusters 01, 03, 08, 11, 12, 22, 26, 27, 28, 31, and 34.

5.8.1 Notes

  1. Clusters 01, 22, 31, 34, and 27 form a group in the hierarchical clustering above, and they all have most cells (more than 75%) coming from villi samples.
    • All five clusters are highly highly correlated with each other, especially clusters 01, 31, and 34.
    • All clusters express various macrophage genes: CD68, CD163, MRC1, MAF, C1QC, C1QA, FOLR2 etc; and more so clusters 01, 31, and 34.
    • Clusters 01 and 31 are most likely villous macrophages (i.e. Hofbauer cells), but the other cluster need more investigation.
    • Clusters 22 and 27, even though correlated with 01 and 34, have marker genes that are not that highly expressed in themselves but are in 01 and 34. Looking at the UMAP plot below, it’s clear that both of these clusters are not as homogenous as the other. The cells are spread between the big myeloid blob and the big fibroblast blob.
    • However, these don’t appear to be clusters of different types of cells. That is, fibroblast genes (COL3A1, COL3A3) and macrophage genes are coexpressed in the same cells in cluster 27 (see plot below). And macrophage genes are broadly expressed in cluster 22, even in cells that seem to be embedded with fibroblast cells. Clusters 22 and 27 might represent cell types that are macrophages but with certain fibroblast-like properties.
    • Looking at cluster 34 markers genes, it interestingly expresses many erythrocyte lineage genes (HBA1, HBA2, HBM, CD1, HBG1, CD235a/GYPA) but it also expresses macrophage genes like CSF1R, FOLR2, MRC1, CD163 etc. If we plot the expression of these genes on UMAP (plotting only clust_34, see below), we see that majority of the cells express macrophage genes, while the erythrocyte genes are expressed in a small number of cells (except HBG2), suggesting that this cluster is made of two cell types. Since majority of the cells in this cluster are macrophage cells, we will label it as such, but should the need arise in further analysis, we can subcluster it further to separate the two cell types out.
  2. Cluster 26 is most likely of erythrocytes (or some cell type from the erythrocyte lineage).
    • The surya dataset does contain what they call an erythroblast cluster, but in erythropoeisis is expected in the placenta in the first trimester. In our term samples, I am not sure if erythrocyte progenitors are expected to be present.
    • Marker genes, similarity with surya EB cluster, and GO enrichement (which include “erythrocyte development”, “oxygen transport”, etc) are all consistent with each other.
    • Most of the cells in this cluster come from villi samples, so fetal origin.
  3. Cluster 28 most likely is made of typical granulocytes.
    • It expresses the granulocyte marker gene CEACAM8, in addition to CEACAM6, ELANE, and MPO.
    • GO enrichment is consistent.
    • All marker genes are consistent: LYZ, DEFA3, DEFA4, S100A8, NCF1, etc.
  4. Cluster 11 and 12 are most likely APC.
    • Of maternal origin. Cluster 12 mostly comes from decidua sample, cluster 11 has mixed contribution, in terms of its gene expression it’s very similar to cluster 12.
    • Very similar to each other. Among vento cluster, most similarity to macrophage clusters. Among surya clusters, most similarity to maternal DC and maternal APC clusters.
    • Expression of classical macrophage/DC markers like CD68, CD163. In addition most of the marker genes of this cluster have to do with antigen presentation. Many of them are MHC-II genes.
    • Not sure if they are macrophages or DCs, but to avoid committing to one conclusion over the other, we can just label them as APCs.
  5. Clusters 3 and 8 are difficult.
    • Low, if any, expression of CD68 and CD163, thus not quite macrophages. But at least cluster 8 is quite similar to macrophages in overall gene expression (see hierarchical clustering above).
    • Low, if any, expression of HLA genes, thus not quite APCs.
    • No expression of typical granulocyte markers: CEACAM8, CEACAM6, MPO, ELANE. But many markers of these clusters sound “granulocytic”. This is evident in GO enrichment as well. In addition, cluster 3 is most similar to cluster 28 (see hierarchical clustering above) which is clearly a granulocyte cluster.
    • Cluster 08 is most similar to the MO_1 and MO_2 clusters from vento. Cluster 3 had low similarity with any of the clusters from vento and surya.
    • Both clusters express MNDA, which is typically only expressed in cells in the monocyte-granulocyte lineage.
    • Both clusters express LYZ, S100A9, S100A8, which I think are also monocyte genes.
    • These might be monocytes but they don’t express CD14.
    • It’s in fact possible to have CD14–ve monocyte, it turns out (Mukherjee et al. 2015). See introduction of (Sampath et al. 2018), which summarizes observations from (Villani et al. 2017). These papers point to non-CD14 monocyte populations. In fact most genes in the Mono_3 cluster of (Villani et al. 2017) are expressed in cluster 03, e.g. MXD1, VNN2, CXCR2.
    • Overall it looks like clusters 3 and 8 are two types of monocytes. (at the very least they are from the monocyte-granulocyte lineage.)
annotation$annotation[annotation$cluster %in% c("clust_01")] <- "vil.Hofb_1"
annotation$annotation[annotation$cluster %in% c("clust_22")] <- "vil.Hofb_2"
annotation$annotation[annotation$cluster %in% c("clust_27")] <- "vil.Hofb_3"
annotation$annotation[annotation$cluster %in% c("clust_31")] <- "vil.Hofb_4"
annotation$annotation[annotation$cluster %in% c("clust_34")] <- "vil.Hofb_5"

annotation$annotation[annotation$cluster %in% c("clust_03")] <- "Mono_1"
annotation$annotation[annotation$cluster %in% c("clust_08")] <- "Mono_2"
annotation$annotation[annotation$cluster %in% c("clust_11")] <- "APC_1"
annotation$annotation[annotation$cluster %in% c("clust_12")] <- "APC_2"
annotation$annotation[annotation$cluster %in% c("clust_26")] <- "vil.Ery"
annotation$annotation[annotation$cluster %in% c("clust_28")] <- "Gran"

annotation$notes[annotation$cluster %in% c("clust_27")] <- "also fibroblast signature"
annotation$notes[annotation$cluster %in% c("clust_34")] <- "likely also contains erythro"
annotation$notes[annotation$cluster %in% c("clust_03")] <- "unsure"
annotation$notes[annotation$cluster %in% c("clust_08")] <- "unsure"

annotation[annotation$cluster %in% paste0("clust_", 
                                          c("01", "03", "08", "11", "12",
                                            "22", "26", "27", "28", "31", "34")), ]

5.8.2 Myeloid candidate genes

DotPlot2(features = c("PTPRC", # pan
                      "CD33", "CD14", "CD68", "CD163", "CLEC10A", "HLA-DRA", # apc
                      "CEACAM8", "CEACAM6", "MPO", "ELANE", # granulocytes
                      "FCGR3A", # inflammatory monocytes?
                      "GYPA" # erythro
                      ), 
         title = "Myeloid candidate genes")

5.8.3 Hofb clusters on UMAP

DimPlot(seur, 
        cols = c(rep("Grey90", 30), "#aec7e8", "#ff9e4a", "#2ca02c", "#ed665d", "#9467bd"), 
        order = (rev(c("clust_01", "clust_22", "clust_27", "clust_31", "clust_34")))) + 
  theme_few() +
  coord_fixed()

5.8.4 clust_22 markers on UMAP

DefaultAssay(seur) <- "SCT"
p <- FeaturePlot(seur, features = c("CD163", "CSF1R", "C1QA", "MRC1", "CD14", "FOLR2"), 
                 cells = rownames(seur@meta.data)[seur@meta.data$seurat_clusters == "clust_22"],
                 coord.fixed = TRUE, ncol = 3, combine = FALSE) 

p[[1]] + p[[2]] + p[[3]] + p[[4]] + p[[5]] + p[[6]] + 
  plot_layout(ncol = 3) +
  plot_annotation(title = "clust_22") &
  theme_bw() +
  theme(panel.grid.major = element_line(size = 0.25),
        panel.grid.minor = element_blank(),
        legend.key.size = unit(0.75, "line"),
        axis.title = element_text(size = 6),
        plot.title = element_text(size = 10)) 

5.8.5 clust_27 markers on UMAP

DefaultAssay(seur) <- "SCT"
p <- FeaturePlot(seur, features = c("COL3A1", "CD14", "MRC1", "LYVE1", "COL1A1"), cells = rownames(seur@meta.data)[seur@meta.data$seurat_clusters == "clust_27"],
                 coord.fixed = TRUE, ncol = 3) 
p[[1]] + p[[2]] + p[[3]] + p[[4]] + p[[5]] + 
  plot_layout(ncol = 3) +
  plot_annotation(title = "clust_27") &
  theme_bw() +
  theme(panel.grid.major = element_line(size = 0.25),
        panel.grid.minor = element_blank(),
        legend.key.size = unit(0.75, "line"),
        axis.title = element_text(size = 6),
        plot.title = element_text(size = 10)) 

5.8.6 clust_34 markers on UMAP

DefaultAssay(seur) <- "SCT"
p <-  FeaturePlot(seur, 
                  features = c("GYPA", "HBM", "HBG2", "HBG1", "ALAS2", "AHSP", 
                               "MRC1", "CD163", "CSF1R"), 
                  cells = rownames(seur@meta.data)[seur@meta.data$seurat_clusters == "clust_34"],
                  coord.fixed = FALSE, ncol = 3)
p[[1]] + p[[2]] + p[[3]] + p[[4]] + p[[5]] + p [[6]] + p[[7]] + p[[8]] + p[[9]] + 
plot_layout(ncol = 3) +
  plot_annotation(title = "clust_34") &
  theme_bw() +
  theme(panel.grid.major = element_line(size = 0.25),
        panel.grid.minor = element_blank(),
        legend.key.size = unit(0.75, "line"),
        axis.title = element_text(size = 6),
        plot.title = element_text(size = 10)) 

5.8.7 Marker gene plots

top50marker.plots[["clust_01"]]

top50marker.plots[["clust_03"]]

top50marker.plots[["clust_08"]]

top50marker.plots[["clust_11"]]

top50marker.plots[["clust_12"]]

top50marker.plots[["clust_22"]]

top50marker.plots[["clust_26"]]

top50marker.plots[["clust_27"]]

top50marker.plots[["clust_28"]]

top50marker.plots[["clust_31"]]

top50marker.plots[["clust_34"]]

5.8.8 GO enrichment

enrichGO2("clust_01")
2% of input gene IDs are fail to map...
enrichGO2("clust_03")
4% of input gene IDs are fail to map...
enrichGO2("clust_08")
1% of input gene IDs are fail to map...
enrichGO2("clust_11")
enrichGO2("clust_12")
1% of input gene IDs are fail to map...
enrichGO2("clust_22")
5.48% of input gene IDs are fail to map...
enrichGO2("clust_26")
4.05% of input gene IDs are fail to map...
enrichGO2("clust_27")
1% of input gene IDs are fail to map...
enrichGO2("clust_28")
3% of input gene IDs are fail to map...
enrichGO2("clust_31")
2% of input gene IDs are fail to map...
enrichGO2("clust_34")
4% of input gene IDs are fail to map...

5.8.9 Vento markers

DotPlot2(features = markers.vento$dM1, title = "vento markers: dM1")

DotPlot2(features = markers.vento$dM2, title = "vento markers: dM2")

DotPlot2(features = markers.vento$HB, title = "vento markers: HB")

DotPlot2(features = markers.vento$M3, title = "vento markers: M3")

DotPlot2(features = markers.vento$MO_1, title = "vento markers: MO_1")

DotPlot2(features = markers.vento$MO_2, title = "vento markers: MO_2")

DotPlot2(features = markers.vento$DC1, title = "vento markers: DC1")

DotPlot2(features = markers.vento$DC2, title = "vento markers: DC2")

5.8.10 Surya markers

DotPlot2(features = markers.surya$gene[markers.surya$cluster == "vil.HC"], title = "surya markers: vil.HC")

DotPlot2(features = markers.surya$gene[markers.surya$cluster == "vil.EB"], title = "surya markers: vil.EB")

DotPlot2(features = markers.surya$gene[markers.surya$cluster == "dec.APC"], title = "surya markers: dec.APC")

DotPlot2(features = markers.surya$gene[markers.surya$cluster == "vil.HC"], title = "surya markers: vil.HC")

6 Final annotations

6.1 Merge similar clusters

In addition to the annotations so far, we can have a second set of coarser-grained annotations so that we have fewer clusters to deal with. For this we will merge some similar clusters—mostly non-immune clusters, leaving all immune clusters intact.

annotation$annotation_merged <- annotation$annotation

annotation$annotation_merged[annotation$annotation %in% paste0("vil.Hofb_", c(1:5))] <- "vil.Hofb"
annotation$annotation_merged[annotation$annotation %in% paste0("dec.DSC_", c(1:2))] <- "dec.DSC"
annotation$annotation_merged[annotation$annotation %in% paste0("dec.FB_", c(1:3))] <- "dec.FB"
annotation$annotation_merged[annotation$annotation %in% paste0("vil.FB_", c(1:3))] <- "vil.FB"
annotation$annotation_merged[annotation$annotation %in% paste0("APC_", c(1:2))] <- "APC"
annotation$annotation_merged[annotation$annotation %in% paste0("vil.VCT_", c(1:4))] <- "vil.VCT"
annotation$annotation_merged[annotation$annotation %in% paste0("vil.SCT_", c(1:2))] <- "vil.SCT"
annotation$annotation_merged[annotation$annotation %in% c("dec.Endo", "dec.Endo.L")] <- "dec.Endo"
# rearrange columns and write annotations to csv
annotation <- annotation[, c("cluster", "annotation", "annotation_merged", "notes")]

write.csv(annotation, "../results/02_annotation/files/annotations.csv", row.names = FALSE)
annotation %>% kable(caption = "Final annotations of all clusters") %>% kable_styling(full_width = FALSE)
Final annotations of all clusters
cluster annotation notes annotation_merged
clust_00 dec.DSC_1 NA dec.DSC
clust_01 vil.Hofb_1 NA vil.Hofb
clust_02 dec.FB_1 NA dec.FB
clust_03 Mono_1 unsure Mono_1
clust_04 dec.Endo.L NA dec.Endo
clust_05 Tcell_1 cytotoxic Tcell_1
clust_06 vil.FB_1 NA vil.FB
clust_07 vil.FB_2 NA vil.FB
clust_08 Mono_2 unsure Mono_2
clust_09 Tcell_2 NA Tcell_2
clust_10 NK_1 NA NK_1
clust_11 APC_1 NA APC
clust_12 APC_2 NA APC
clust_13 vil.EVT NA vil.EVT
clust_14 NK_2 NA NK_2
clust_15 dec.FB_2 NA dec.FB
clust_16 vil.VCT_1 NA vil.VCT
clust_17 vil.VCT_2 NA vil.VCT
clust_18 dec.DSC_2 NA dec.DSC
clust_19 dec.SMC NA dec.SMC
clust_20 Bcell NA Bcell
clust_21 dec.Endo NA dec.Endo
clust_22 vil.Hofb_2 NA vil.Hofb
clust_23 vil.VCT_3 NA vil.VCT
clust_24 vil.SCT_1 NA vil.SCT
clust_25 vil.VCT_4 proliferating vil.VCT
clust_26 vil.Ery NA vil.Ery
clust_27 vil.Hofb_3 also fibroblast signature vil.Hofb
clust_28 Gran NA Gran
clust_29 vil.SCT_2 NA vil.SCT
clust_30 NK_3 proliferating NK_3
clust_31 vil.Hofb_4 NA vil.Hofb
clust_32 Tcell_3 NA Tcell_3
clust_33 vil.FB_3 NA vil.FB
clust_34 vil.Hofb_5 likely also contains erythro vil.Hofb

6.2 Add annotations to Seurat object

# add annotations
seur@meta.data$annotation <- annotation$annotation[match(
  x = seur@meta.data$seurat_clusters,
  table = annotation$cluster)
  ]

seur@meta.data$annotation_merged <- annotation$annotation_merged[match(
  x = seur@meta.data$seurat_clusters, 
  table = annotation$cluster)
  ]
# write seurat object
saveRDS(seur, file = "../data/seurat-object_annotated.rds")

6.3 Plots

6.3.1 UMAP: all annotations

Idents(seur) <- seur@meta.data$annotation
DimPlot(seur, label = TRUE, repel = TRUE, label.size = 4, shuffle = TRUE) +
  coord_fixed() +
  theme_minimal() +
  theme(legend.position = "none",
        panel.grid = element_blank(),
        axis.title = element_text(size = 8))
Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
Please use `as_label()` or `as_name()` instead.
This warning is displayed once per session.

ggsave(last_plot(), filename = "../results/02_annotation/plots/umap_annotated.pdf", 
       device = "pdf", width = 7, height = 7, units = "in")

6.3.2 UMAP: merged annotations

Idents(seur) <- seur@meta.data$annotation_merged
DimPlot(seur, label = TRUE, repel = TRUE, label.size = 4, shuffle = TRUE) +
  coord_fixed() +
  theme_minimal() +
  theme(legend.position = "none",
        panel.grid = element_blank(),
        axis.title = element_text(size = 8))


ggsave(last_plot(), filename = "../results/02_annotation/plots/umap_annotated_merged.pdf", 
       device = "pdf", width = 7, height = 7, units = "in")

7 Session Info

sessionInfo()
R version 4.0.2 (2020-06-22)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Mojave 10.14.6

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] org.Hs.eg.db_3.11.4    AnnotationDbi_1.50.3   IRanges_2.22.2         S4Vectors_0.26.1       Biobase_2.48.0        
 [6] BiocGenerics_0.34.0    clusterProfiler_3.16.1 ggthemes_4.2.4         patchwork_1.1.1        kableExtra_1.3.1      
[11] Seurat_3.2.3           forcats_0.5.0          stringr_1.4.0          dplyr_1.0.3            purrr_0.3.4           
[16] readr_1.4.0            tidyr_1.1.2            tibble_3.0.5           ggplot2_3.3.3          tidyverse_1.3.0       

loaded via a namespace (and not attached):
  [1] reticulate_1.18       tidyselect_1.1.0      RSQLite_2.2.2         htmlwidgets_1.5.3     grid_4.0.2           
  [6] BiocParallel_1.22.0   Rtsne_0.15            scatterpie_0.1.5      munsell_0.5.0         codetools_0.2-18     
 [11] ica_1.0-2             future_1.21.0         miniUI_0.1.1.1        withr_2.4.0           colorspace_2.0-0     
 [16] GOSemSim_2.14.2       highr_0.8             knitr_1.30            rstudioapi_0.13       ROCR_1.0-11          
 [21] tensor_1.5            DOSE_3.14.0           listenv_0.8.0         labeling_0.4.2        urltools_1.7.3       
 [26] polyclip_1.10-0       bit64_4.0.5           farver_2.0.3          downloader_0.4        parallelly_1.23.0    
 [31] vctrs_0.3.6           generics_0.1.0        xfun_0.20             R6_2.5.0              graphlayouts_0.7.1   
 [36] rsvd_1.0.3            spatstat.utils_1.20-2 fgsea_1.14.0          gridGraphics_0.5-1    assertthat_0.2.1     
 [41] promises_1.1.1        scales_1.1.1          ggraph_2.0.4          enrichplot_1.8.1      gtable_0.3.0         
 [46] globals_0.14.0        goftest_1.2-2         tidygraph_1.2.0       rlang_0.4.10          splines_4.0.2        
 [51] lazyeval_0.2.2        broom_0.7.3           europepmc_0.4         BiocManager_1.30.10   yaml_2.2.1           
 [56] reshape2_1.4.4        abind_1.4-5           modelr_0.1.8          backports_1.2.1       httpuv_1.5.5         
 [61] qvalue_2.20.0         tools_4.0.2           ggplotify_0.0.5       ellipsis_0.3.1        RColorBrewer_1.1-2   
 [66] ggridges_0.5.3        Rcpp_1.0.6            plyr_1.8.6            progress_1.2.2        prettyunits_1.1.1    
 [71] rpart_4.1-15          deldir_0.2-9          pbapply_1.4-3         viridis_0.5.1         cowplot_1.1.1        
 [76] zoo_1.8-8             haven_2.3.1           ggrepel_0.9.1         cluster_2.1.0         fs_1.5.0             
 [81] magrittr_2.0.1        data.table_1.13.6     scattermore_0.7       DO.db_2.9             lmtest_0.9-38        
 [86] triebeard_0.3.0       reprex_0.3.0          RANN_2.6.1            fitdistrplus_1.1-3    matrixStats_0.57.0   
 [91] hms_1.0.0             mime_0.9              evaluate_0.14         xtable_1.8-4          readxl_1.3.1         
 [96] gridExtra_2.3         compiler_4.0.2        KernSmooth_2.23-18    crayon_1.3.4          htmltools_0.5.1      
[101] mgcv_1.8-33           later_1.1.0.1         lubridate_1.7.9.2     DBI_1.1.1             tweenr_1.0.1         
[106] dbplyr_2.0.0          MASS_7.3-53           Matrix_1.3-2          cli_2.2.0             igraph_1.2.6         
[111] pkgconfig_2.0.3       rvcheck_0.1.8         plotly_4.9.3          xml2_1.3.2            webshot_0.5.2        
[116] rematch_1.0.1         rvest_0.3.6           digest_0.6.27         sctransform_0.3.2     RcppAnnoy_0.0.18     
[121] spatstat.data_1.7-0   rmarkdown_2.6         cellranger_1.1.0      leiden_0.3.6          fastmatch_1.1-0      
[126] uwot_0.1.10           shiny_1.5.0           lifecycle_0.2.0       nlme_3.1-151          jsonlite_1.7.2       
[131] viridisLite_0.3.0     fansi_0.4.2           pillar_1.4.7          lattice_0.20-41       fastmap_1.0.1        
[136] httr_1.4.2            survival_3.2-7        GO.db_3.11.4          glue_1.4.2            spatstat_1.64-1      
[141] png_0.1-7             bit_4.0.4             ggforce_0.3.2         stringi_1.5.3         blob_1.2.1           
[146] memoise_1.1.0         irlba_2.3.3           future.apply_1.7.0   

8 References

Liu, Y., X. Fan, R. Wang, X. Lu, Y-L. Dang, H. Wang, H-Y. Lin, et al. 2018. “Single-Cell RNA-Seq Reveals the Diversity of Trophoblast Subtypes and Patterns of Differentiation in the Human Placenta.” Cell Research 28 (8): 819–32. https://doi.org/10.1038/s41422-018-0066-y.

Mukherjee, R., P. Kanti Barman, P. Kumar Thatoi, R. Tripathy, B. Kumar Das, and B. Ravindran. 2015. “Non-Classical Monocytes Display Inflammatory Features: Validation in Sepsis and Systemic Lupus Erythematous.” Scientific Reports 5 (1): 13886.

Pavličev, M., G. P. Wagner, A. R. Chavan, K. Owens, J. Maziarz, C. Dunn-Fletcher, S. G. Kallapur, L. Muglia, and H. Jones. 2017. “Single-Cell Transcriptomics of the Human Placenta: Inferring the Cell Communication Network of the Maternal-Fetal Interface.” Genome Research. https://doi.org/10.1101/gr.207597.116.

Pizzolato, G., H. Kaminski, M. Tosolini, D-M. Franchini, F. Pont, F. Martins, C. Valle, et al. 2019. “Single-Cell Rna Sequencing Unveils the Shared and the Distinct Cytotoxic Hallmarks of Human Tcrvδ1 and Tcrvδ2 γδ T Lymphocytes.” Proceedings of the National Academy of Sciences 116 (24): 11906–15. https://doi.org/10.1073/pnas.1818488116.

Sampath, P., K. Moideen, U. D. Ranganathan, and R. Bethunaickan. 2018. “Monocyte Subsets: Phenotypes and Function in Tuberculosis Infection.” Frontiers in Immunology 9: 1726. https://doi.org/10.3389/fimmu.2018.01726.

Suryawanshi, H., P. Morozov, A. Straus, N. Sahasrabudhe, K. E. A. Max, A. Garzia, M. Kustagi, T. Tuschl, and Z. Williams. 2018. “A Single-Cell Survey of the Human First-Trimester Placenta and Decidua.” Science Advances 4 (10): eaau4788. https://doi.org/10.1126/sciadv.aau4788.

Vento-Tormo, R., M. Efremova, R. A. Botting, M. Y. Turco, M. Vento-Tormo, K. B. Meyer, J-E. Park, et al. 2018. “Single-Cell Reconstruction of the Early Maternal–Fetal Interface in Humans.” Nature 563 (7731): 347–53. https://doi.org/10.1038/s41586-018-0698-6.

Villani, A-C., R. Satija, G. Reynolds, S. Sarkizova, K. Shekhar, J. Fletcher, M. Griesbeck, et al. 2017. “Single-Cell Rna-Seq Reveals New Types of Human Blood Dendritic Cells, Monocytes, and Progenitors.” Science 356 (6335). https://doi.org/10.1126/science.aah4573.

---
title: "Annotation of clusters in 10x data"
author: "Arun Chavan"
output:
  html_notebook:
    toc: yes
    toc_float: yes
    number_sections: true
bibliography: ../refs.bib
---
Started: 2020-09-02  
Last edited: `r format(Sys.time())`

```{r message=FALSE, warning=FALSE}
### packages ==================================================================
library(tidyverse)

# single cell
library(Seurat)

# rmd
library(kableExtra)

# plotting
library(patchwork)
library(ggthemes)

# go enrichment
library(clusterProfiler)
library(org.Hs.eg.db)
```

We can take a first pass at annotating the clusters from our 10x data by comparing them to the reference dataset that we put together. This was done by compiling celltype-averaged expression values from the following single cell RNA-seq studies: [@pavlicev_single-cell_2017; @vento-tormo_single-cell_2018; @suryawanshi_single-cell_2018]. 

# Data

## Samples
This is the sample information sent by Alice. Not sure about the INP id for `AL09` and `AL10`. The library for `INP188` villi (associated with `AL07`) was problematic, so is not used.  

```{r}
sample.info <- read_csv("../ext/from-alice/sample_info.csv")
sample.info %>% kable(caption = "Sample information") %>% kable_styling(full_width = FALSE)
```

## Seurat-processed data
Read the `Seurat`-processed (by Eric) data sent by Alice.  

```{r}
seur <- readRDS("../ext/from-alice/placenta.rds")

head(seur@meta.data)
```
### Add sample metadata

The `orig.ident` column in the metadata represents the sample IDs from the sample info table above. We just have rename them to left-pad the numbers for consistency, and set new cluster names as default idents.

```{r}
# left pad orig.idents
seur@meta.data$orig.ident <- gsub("(AL)(\\d)$", "\\10\\2", seur@meta.data$orig.ident)

# add additional metadata (covid status, tissue)
seur@meta.data$covid <- sample.info$covid[match(x = seur@meta.data$orig.ident, 
                                                table = sample.info$AL_id)]
seur@meta.data$tissue <- sample.info$tissue[match(x = seur@meta.data$orig.ident,
                                                  table = sample.info$AL_id)]

# rename clusters for consistency
seur$seurat_clusters <- paste0("clust_", seur$seurat_clusters)
seur$seurat_clusters <- gsub("(clust_)(\\d$)", "\\10\\2", as.character(seur$seurat_clusters)) # left pad
seur$seurat_clusters <- factor(seur$seurat_clusters, levels = paste0("clust_", c("00", "01", "02", "03", "04", "05", "06", "07", "08", "09", 10:34)))

# set seurat clusters as default idents
Idents(seur) <- seur@meta.data$seurat_clusters
```

```{r}
# add sample_id
seur@meta.data$sample_id <- sample.info$sample_id[match(x = seur@meta.data$orig.ident, 
                                                        table = sample.info$AL_id)]
```

## Average by cluster
We have to get transcriptomes averaged by clusters, so that we can compare them with the reference data in order to narrow down annotations. For averaging, we will use `sctransform` normalized data since this is what we use for marker gene detection down the line.

```{r message=FALSE}
# average sctransformed expression
seur.avg <- AverageExpression(seur)

# averaged data to use for correlations (SCT transformed)
plac <- seur.avg$SCT
```

## Reference data
```{r}
# read reference datasets
ref <- read.csv("../results/01_reference-atlas/vento-surya-pavli_joined.csv", stringsAsFactors = FALSE)
```

## Get data in shape
We will join the data with the reference data so they are both in the same dataframe. 

```{r}
# add gene names column
plac$gene_name <- rownames(plac)

# inner join
plac <- dplyr::inner_join(plac, ref, by = "gene_name")

head(plac)
```

# Correlations within the data
Before moving to annotating clusters by comparison with the reference dataset, first we will look at the correlation structure within the dataset.

## Correlation matrix
```{r}
# build correlation matrix from expression data
cor.matrix <- cor(plac %>% dplyr::select(starts_with("clust")), method = 'spearman')

# reorder correlation matrix based on clustering
dd <- as.dist((1 - cor.matrix)) 
hc <- hclust(dd, method = 'complete')
cormat <- cor.matrix[hc$order, hc$order]

# melt correlation matrix
dat <- reshape2::melt(cormat, na.rm = T)

## plot
p <- ggplot(data = dat, aes(Var1, Var2, fill = value)) +
  geom_tile(colour = "white") +
  scale_fill_gradient(low = 'white', high = 'red',
                      name = "Spearman\nCorrelation") +
  coord_fixed(ratio = 1) +
  labs(title = "Correlations among clusters") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, size = 8, 
                                   hjust=1, vjust = 0.5),
        axis.text.y = element_text(size=8),
        axis.ticks.length = unit(0.15, units = c('lines')),
        legend.title = element_text(size = 10),
        axis.title = element_blank(),
        panel.grid = element_blank(),
        panel.border = element_blank(),
        legend.key.size = unit(0.8, units = c('lines')))

ggsave(p, filename = "../results/02_annotation/plots/corrplot_clusters.pdf", 
       device = "pdf", width = 7, height = 6, units = "in")

print(p)
```

## Heirarchical clustering
```{r}
# build correlation matrix from expression data
cor.matrix <- cor(plac %>% dplyr::select(starts_with("clust")), method = 'spearman')

# reorder correlation matrix based on clustering
dd <- as.dist((1 - cor.matrix)) 
hc <- hclust(dd, method = 'complete')

pdf(file = "../results/02_annotation/plots/hclust_clusters.pdf", width = 7, height = 5)
plot(hc, cex = 0.8)
dev.off()
plot(hc, cex = 0.8)
```

There are three big cluster blocks, with substructure within them. The cluster blocks also correspond well with the UMAP plot that was sent by Alice. 

# Annotation by correlation
Let's now actually measure correlations between all clusters with the reference datasets. We will treat our data as query dataset and for each cluster assign top 3 cell types in each reference dataset.

```{r}
refdata <- c("vento", "surya", "pavli")

# build correlation matrix from expression data
cor.matrix <- cor(plac[, names(plac)[names(plac) != "gene_name"]], 
                  method = 'spearman')

# reorder correlation matrix based on clustering
dd <- as.dist((1 - cor.matrix))
hc <- hclust(dd, method = 'complete')
cor.matrix <- cor.matrix[hc$order, hc$order]

# melt correlation matrix
cormat <- reshape2::melt(cor.matrix, na.rm = T)
cormat$Var1_source <- sapply(strsplit(as.character(cormat$Var1), split = "_"), "[[", 1)
cormat$Var2_source <- sapply(strsplit(as.character(cormat$Var2), split = "_"), "[[", 1)

# subset
cormat <- cormat[which(cormat$Var1_source %in% refdata & 
                   cormat$Var2_source == "clust"), ]

# top 3 matches with highest correlation coefficients
topmatch <- data.frame( # empty df to fill top matches from the loop below
  cluster = unique(as.character(cormat$Var2)) %>% sort(),
  vento.top1 = NA,
  surya.top1 = NA,
  pavli.top1 = NA,
  vento.top2 = NA,
  surya.top2 = NA,
  pavli.top2 = NA,
  vento.top3 = NA,
  surya.top3 = NA,
  pavli.top3 = NA
)

cormat$top3 <- NA

for(i in unique(cormat$Var2)){
  for(j in refdata){
    # identify top3 match indices
    ind <- which(cormat$Var2 == i & cormat$Var1_source == j)
    val <- cormat$value[ind]
    top3ind <- ind[order(val, decreasing = TRUE)[1:3]]
    
    # assign match ranking to correlation data for plotting
    cormat$top3[top3ind[1]] <- "1"
    cormat$top3[top3ind[2]] <- "2"
    cormat$top3[top3ind[3]] <- "3"
    
    # assign top matches to topmatches dataframe
    topmatch[topmatch$cluster == i, paste0(j, ".top1")] <- gsub(paste0(j, "_"), "", as.character(cormat$Var1[top3ind[1]]))
    topmatch[topmatch$cluster == i, paste0(j, ".top2")] <- gsub(paste0(j, "_"), "", as.character(cormat$Var1[top3ind[2]]))
    topmatch[topmatch$cluster == i, paste0(j, ".top3")] <- gsub(paste0(j, "_"), "", as.character(cormat$Var1[top3ind[3]]))
  }
}

```

## Plots

```{r}
# plotting function
clustAnnoPlot <- function(dat, query, reference) {
  
  dat <- dat[which(dat$Var2_source == query & dat$Var1_source == reference), ]
  
  p <- ggplot(data = dat, 
              aes(Var1, Var2, fill = value)) +
    geom_tile(colour = "white") +
    scale_fill_gradient(low = 'white', high = 'red',
                        name = "Spearman\nCorrelation") +
    geom_point(aes(Var1, Var2, alpha = top3),
               size = 1.5, shape = 19, stroke  = 0) +
    scale_alpha_manual(values = c(1, 0.5, 0.25), 
                       breaks = c(1, 2, 3),
                       name = "top3", na.value = 0) +
    coord_fixed(ratio = 1) +
    xlab("reference") +
    ylab("query") +
    labs(caption = "For each celltype in query, black points represent top 3 celltypes from reference with highest correlation.",
         title = paste0(query, " vs. ", reference)) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, size = 8, 
                                     hjust=1, vjust = 0.5),
          axis.text.y = element_text(size=8),
          axis.ticks.length = unit(0.15, units = c('lines')),
          legend.title = element_text(size = 10),
          axis.title = element_text(size = 8),
          plot.caption = element_text(size = 7),
          panel.grid = element_blank(),
          panel.border = element_blank(),
          legend.key.size = unit(0.8, units = c('lines')))
  
  return(p)
}

```

### Against Vento-Tormo et al
```{r}
## Annotation plots
anno.vento <- clustAnnoPlot(dat = cormat, query = "clust", reference = "vento")
cowplot::ggsave2(anno.vento, device = "pdf", width = 7.5, height = 6.5, units = "in",
                 filename = "../results/02_annotation/plots/annotation-by-correlation_ref-vento.pdf")
print(anno.vento)
```

### Against Suryavanshi et al
```{r}
## Annotation plots
anno.surya <- clustAnnoPlot(dat = cormat, query = "clust", reference = "surya")
cowplot::ggsave2(anno.surya, device = "pdf", width = 7.5, height = 6.5, units = "in",
                 filename = "../results/02_annotation/plots/annotation-by-correlation_ref-surya.pdf")
print(anno.surya)
```

### Against Pavlicev et al
```{r fig.asp=0.9}
## Annotation plots
anno.pavli <- clustAnnoPlot(dat = cormat, query = "clust", reference = "pavli")
cowplot::ggsave2(anno.pavli, device = "pdf", width = 7, height = 6, units = "in",
                 filename = "../results/02_annotation/plots/annotation-by-correlation_ref-pavli.pdf")
print(anno.pavli)
```

## Top matches
```{r}
topmatch %>% kable(caption = "Top 3 matches against all reference datasets.") %>% kable_styling(full_width = FALSE, font_size = 10)

```

## Labels for clusters
We need to create consistent labels for all cell types that we identify. Below is a `list` I created in which the top level objects are labels we will give to the cell types, within which contained are lists of corresponding labels from `vento` and `surya`. These are tentative labels. After the first pass, when we go through the clusters with a fine-toothed comb, we can modify them further. For example, if we have multiple clusters labeled as `dec.DSC` (decidual macrophages), and if those clusters are distinct, we can then break up this label into `dec.DSC1` and `dec.DSC2`. 

```{r}
labs <- list("dec.DSC"    = list("vento.lab" = c("dS1", "dS2", "dS3"),
                                 "surya.lab" = c("dec.DSC", "dec.FB1", "dec.FB2")),
             "DC"     = list("vento.lab" = c("DC1", "DC2"),
                                 "surya.lab" = c("dec.DC1", "dec.DC2")),
             "Mac"    = list("vento.lab" = c("dM1", "dM2", "dM3"),
                                 "surya.lab" = c("dec.MAC")),
             "NK"     = list("vento.lab" = c("dNK.p", "dNK1", "dNK2", "dNK3", "NK.CD16neg", "NK.CD16pos"),
                                 "surya.lab" = c("dec.NK1", "dec.NK2")),
             "dec.SMC"    = list("vento.lab" = c("dP1", "dP2"),
                                 "surya.lab" = c("dec.SMC")),
             "dec.Endo"   = list("vento.lab" = c("Endo.m"),
                                 "surya.lab" = c("dec.VEC")),
             "dec.Epi"    = list("vento.lab" = c("Epi1", "Epi2"),
                                 "surya.lab" = c("dec.EEC")),
             "Tcell"  = list("vento.lab" = c("Tcells"),
                                 "surya.lab" = c("dec.TC")),
             "vil.Endo"   = list("vento.lab" = c("Endo.f"),
                                 "surya.lab" = c("vil.VEC")),
             "vil.EVT"    = list("vento.lab" = c("EVT"),
                                 "surya.lab" = c("vil.EVT")),
             "vil.FB"     = list("vento.lab" = c("fFB1", "fFB2"),
                                 "surya.lab" = c("vil.FB1", "vil.FB2", "vil.FB3")),
             "vil.Hofb"   = list("vento.lab" = c("HB"),
                                 "surya.lab" = c("vil.HC")),
             "vil.SCT"    = list("vento.lab" = c("SCT"),
                                 "surya.lab" = c("vil.SCT")),
             "vil.VCT"    = list("vento.lab" = c("VCT"),
                                 "surya.lab" = c("vil.VCT")),
             "Gran"   = list("vento.lab" = c("Granulocytes"),
                                 "surya.lab" = c()),
             "ILC"    = list("vento.lab" = c("ILC3"),
                                 "surya.lab" = c()),
             "Mono"   = list("vento.lab" = c("MO"),
                                 "surya.lab" = c()),
             "Plasma" = list("vento.lab" = c("Plasma"),
                                 "surya.lab" = c()),
             "EB"     = list("vento.lab" = c(),
                                 "surya.lab" = c("vil.EB")),
             "unk.Endo.L" = list("vento.lab" = c("Endo.L"),
                                 "surya.lab" = c("dec.LEC"))
)

```

## Matches consistent between reference datasets
The following clusters have consistent top1 match between `vento` and `surya` references, i.e. they correspond to the same cell type category. The `pavli` reference is too unresolved to be used diagnostically, so we will ignore it for now. For some cell types it is confirmatory, though, e.g. cluster_00 corresponds to DSC in all three references. 

```{r}
# find out which clusters are consistent.
# If vento.top1 and surya.top1 are found in the same item in the labs list, the mapping is consistent.
consistent <- c(NA)
for(i in 1:nrow(topmatch)){
  consistent[i] <- grep(topmatch$vento.top1[i], labs) == grep(topmatch$surya.top1[i], labs)
}

# subset to include consistent set
topmatch.cons <- topmatch %>% 
  filter(consistent) %>% 
  dplyr::select(cluster, vento.top1, surya.top1)

# add our tentative labels to the clusters
for(i in 1:nrow(topmatch.cons)){
  topmatch.cons$label[i] <- names(labs)[grep(topmatch.cons$vento.top1[i], labs)]
}

# print
topmatch.cons[order(topmatch.cons$label), ] %>% knitr::kable() %>% kable_styling(full_width = FALSE)
```

## Ambiguous clusters
The top1 matches for some clusters with `vento` and `surya` are not the same. The are the clusters that will require a more detailed look to determine their identify. 

```{r}
# subset for ambiguous clusters
topmatch.ambig <- topmatch %>% 
  filter(!consistent) %>% 
  dplyr::select(cluster, vento.top1, surya.top1)

# add our labels.
for(i in 1:nrow(topmatch.ambig)){
  topmatch.ambig$label[i] <- paste0(
    names(labs)[grep(topmatch.ambig$vento.top1[i], labs)],
    " or ",
    names(labs)[grep(topmatch.ambig$surya.top1[i], labs)]
    )
}

# print
topmatch.ambig[order(topmatch.ambig$label), ] %>% kable() %>% kable_styling(full_width = FALSE)
```

# Annotation refinement
The annotations we have so far are only a starting point. Now we can look at each annotation more carefully to make sure that everything makes sense.

## Sample contribution to clusters
One of the ways in which we can refine the clusters further is by knowing which tissue that sample arises from. For example, if a cluster has macrophage gene signature and if most cells in that cluster come from villi samples, we can further narrow down the identity of the cluster to Hofbauer cells. We can do this by simply counting for each cluster how many cells come from decidua vs villi and by calculating the percentage. 

```{r fig.asp=1.2, fig.width=6.5}
# count cells in each cluster by tissue of origin
bytissue <- table(seur$tissue, seur$seurat_clusters) %>% 
  as.data.frame() %>% 
  dplyr::rename(tissue = Var1, cluster = Var2, frequency = Freq)

# calculate fraction
for(i in 1:nrow(bytissue)){
  bytissue$fraction[i] <- bytissue$frequency[i]/
    sum(bytissue$frequency[bytissue$cluster == bytissue$cluster[i]])
}

# plot
p.cells <- ggplot(data = bytissue, aes(x = frequency, y = cluster)) +
  geom_bar(aes(fill = tissue), stat = "identity", position = "stack") +
  ggtitle("Number of cells")

p.frac <- ggplot(data = bytissue, aes(x = fraction, y = cluster)) +
  geom_bar(aes(fill = tissue), stat = "identity", position = "stack") +
  ggtitle("Fraction of cells")

p.cells + p.frac + plot_layout(guides = "collect", nrow = 2) +
  plot_annotation(caption = "Sample contribution to clusters") &
  coord_flip() &
  scale_fill_tableau(palette = "Classic 10 Medium") &
  theme_minimal() +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major = element_line(size = 0.25),
        plot.title = element_text(size = 10),
        axis.text.x = element_text(angle = 90, vjust = 0.5))
  

```
Following clusters have more than 70% cells coming from the same tissue of origin. 

```{r}
bytissue[, c("cluster", "tissue", "fraction")] %>% 
  pivot_wider(names_from = "tissue", values_from = "fraction") %>% 
  filter(decidua > 0.70 | villi > 0.70) %>% 
  kable(digits = 2, row.names = FALSE, 
        caption = "Clusters with more than 70% cells from one tissue") %>%
  kable_styling(full_width = FALSE)
```

Following clusters have ambiguous origin, i.e. they contain cells from decidua and villi samples. 

```{r}
bytissue[, c("cluster", "tissue", "fraction")] %>% 
  pivot_wider(names_from = "tissue", values_from = "fraction") %>% 
  filter(decidua < 0.70 & villi < 0.70) %>% 
  kable(digits = 2, row.names = FALSE, 
        caption = "Clusters with cells from both tissues") %>%
  kable_styling(full_width = FALSE)
```

This is largely consistent with the putative annotations. All putatively trophoblast clusters come overwhelmingly from villi samples. The only exception is extravillous trophoblast cluster, which as expected, comes from decidua samples. At least in the species that I have worked with, it is often difficult to separate the fetal tissue from maternal tissue. Therefore, we use this analysis only as a rough guide, i.e. when there is a conflict between the two, the marker gene-based annotation should take precedence over tissue of origin in cell type id because the latter is experimentally (and as a consequence, analytically) messy to disentangle.

Most clusters that have mixed origin, are putative immune cell type clusters. This could suggest that the immune cell types are present on both maternal and fetal sides of the interface, and because they are the same cell type, they end up in the same cluster. This means that we can't confidently say for immune cells whether they are decidual or villous. For this reason, when such ambiguity is present, we will label them without the tissue prefix, i.e. for example `Tcell` instead of `dec.Tcell` or `vil.Tcell`.

## Marker genes
### Identify marker genes 
I ran `Seurat::findAllMarkers` on this data using the `SCT` assay. This function outputs a list of marker genes for each cluster compared to all other cells in the data. This takes a while (hours) to run, so I ran it on the cluster and saved the results to a `.csv`. See the `code` directory for full code, and see below for the function call. 

```{r echo=TRUE, eval=FALSE}
# set default assay to SCT
DefaultAssay(object = plac) <- "SCT"

# find markers for all clusters
markers <- FindAllMarkers(object = plac, 
                          only.pos = TRUE,
                          logfc.threshold = 0.25)  

# write.csv
write.csv(markers, "/home/arc78/scratch60/covid-placenta/markers_sct.csv", row.names = FALSE)
```

```{r}
markers <- read.csv("../results/02_annotation/files/markers_sct.csv")

# rename clusters
markers$cluster <- paste0("clust_", markers$cluster)
markers$cluster <- gsub("(clust_)(\\d)$", "\\10\\2", markers$cluster)
markers$cluster <- factor(markers$cluster, levels = unique(markers$cluster))

# split by cluster
markers <- split.data.frame(markers, markers$cluster)
```

### Plot marker genes
We will plot top 50 marker genes for all clusters and save the plot to pdf. 
```{r}
# plotting function
DotPlot2 <- function(object = seur, assay = "SCT", features, title, ...) {
  p <- DotPlot(object, 
               assay = assay,
               features = features, 
               dot.min = 0.05, dot.scale = 4) +
    coord_flip() +
    labs(caption = paste0(assay, " normalized expression"), title = title) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
          panel.grid = element_blank())
  
  return(p)
}
```

```{r fig.asp=1}
top50marker.plots <- list()
for(i in names(markers)) {
  if(nrow(markers[[i]]) > 50)
    features <- markers[[i]]$gene[1:50]
  if(nrow(markers[[i]]) < 50)
    features <- markers[[i]]$gene

  top50marker.plots[[i]] <- DotPlot2(features = features, title = i)
         
  cowplot::ggsave2(top50marker.plots[[i]], 
                   filename = paste0("../results/02_annotation/plots/top50markers_dotplot_", i, ".pdf"),
                   width = 7.5, height = 7.5, units = "in")
}
```

# Manual annotation
With the marker gene plots we have made, we can examine the top marker genes in each cluster to manually confirm or adjust the annotation of each cluster.

Let's make a dataframe in which to keep track of the assignments.

```{r}
annotation <- data.frame(
  cluster = unique(seur$seurat_clusters) %>% sort(),
  annotation = NA,
  notes = NA
)
```

For each set of marker genes, we can also perform a number of manual checks to assess cell type annotation.  

First is GO enrichment using `clusterProfiler` package, for which below is a function for quickly looking at top 10 enriched GO categories.
```{r}
# function for GO enrichment
# 1. for a given cluster, prints n.print.rows top enriched GO terms
# 2. Can exclude ribosomal genes (default) because they dominate some GO enrichments
enrichGO2 <- function(clust, ngenes = 100, include.ribo = FALSE, n.print.rows = 10, ...) {
  
  ids.bitr <- bitr(markers[[clust]]$gene[1:ngenes], 
                fromType = "SYMBOL", toType = c("ENTREZID", "SYMBOL"), 
                OrgDb = "org.Hs.eg.db")
    
  if(include.ribo == FALSE)
    features <- ids.bitr$ENTREZID[!(grepl("RP[SL]", ids.bitr$SYMBOL))]
  if(include.ribo == TRUE)
    features <- ids.bitr$ENTREZID

  ego <- enrichGO(
    gene          = features,    
    OrgDb         = org.Hs.eg.db,
    ont           = "BP",
    pAdjustMethod = "BH",
    pvalueCutoff  = 0.01, # default 0.01
    qvalueCutoff  = 0.05, # default 0.05
    readable = TRUE)
  
  dftoprint <- clusterProfiler::simplify(ego)@result[1:n.print.rows, c(2,8)]
  rownames(dftoprint) <- NULL
  
  return(dftoprint)
}
```

We can also check if the cluster is enriched is genes that are labeled as markers genes for a cell type by other studies. [@vento-tormo_single-cell_2018] have provided top 30 marker genes for all cell types identified by them.

```{r message=FALSE}
# read top 30 markers from vento-tormo et al
markers.vento <- readxl::read_excel("../ext/from-papers/vento-tormo_2018/41586_2018_698_MOESM1_ESM/Supplementary Table 2.xlsx")

# rename columns
markers.vento <- dplyr::rename(markers.vento,
                               VCT_1 = VCT...2,
                               Tcells_1 = `T cells...5`,
                               Tcells_2 = `T cells...9`,
                               VCT_2 = `VCT...10`,
                               Tcells_3 = `T cells...11`,
                               FB_1 = F1,
                               EVT_1 = `EVT...14`,
                               Tcells_4 = `T cells...15`,
                               EVT_2 = `EVT...16`,
                               NKCD16pos = `Blood NK CD16+`,
                               MO_1 = `MO...22`,
                               MO_2 = `MO...27`,
                               NKCD16neg = `Blood NK CD16-`,
                               FB2 = F2,
                               Endo.m = `Endo (m)`,
                               Endo.L = `Endo L`,
                               Endo.f = `Endo (f)` 
)

# remove ensembl ids
markers.vento <- apply(X = markers.vento, MARGIN = 2, FUN = function(x) gsub("_ENSG\\d+", "", x)) %>% 
  as.data.frame()
```

[@suryawanshi_single-cell_2018] have also provided a list of top30 marker genes in the supplementary data. 
```{r}
markers.surya.vil <- readxl::read_excel("../ext/from-papers/suryawanshi_2018/supp-data/aau4788_Data_file_S3.xlsx", sheet = "Villi", range = "A1:G271", trim_ws = TRUE)
markers.surya.dec <- readxl::read_excel("../ext/from-papers/suryawanshi_2018/supp-data/aau4788_Data_file_S3.xlsx", sheet = "Deciuda", range = "A1:G331", trim_ws = TRUE)

markers.surya.vil <- dplyr::rename(markers.surya.vil, cluster = `cell type`)

markers.surya.vil$cluster <- paste0("vil.", markers.surya.vil$cluster)
markers.surya.dec$`cluster` <- paste0("dec.", markers.surya.dec$`cluster`)

markers.surya <- rbind(markers.surya.vil, markers.surya.dec)
```


## DSC
Clusters 0 and 18.

### Notes
1. Clusters 0 and 18 highly correlated with each other and most similar to each other than to other clusters. 
2. Both are consistently similar to the decidual stromal cells in both vento and surya datasets. 
3. The marker genes include Prolactin, IGFBP1 etc, which are typical of decidual stromal cells. 
4. These two clusters also arise unabiguously from the decidua samples rather than villi samples.

```{r}
annotation$annotation[annotation$cluster %in% c("clust_00")] <- "dec.DSC_1"
annotation$annotation[annotation$cluster %in% c("clust_18")] <- "dec.DSC_2"

annotation[annotation$cluster %in% paste0("clust_", c("00", "18")), ]
```

### Marker genes plots
```{r fig.asp=1}
top50marker.plots[["clust_00"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_18"]]
```

### Go anrichment
```{r}
enrichGO2("clust_00")
```

### Vento markers
```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$dS3, title = "vento: dS3")
```

### Surya markers
```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.surya$gene[markers.surya$cluster == "dec.DSC"], title = "surya markers: DSC")
```

## Endothelial cells
Clusters 4 and 21.

### Notes

1. Clusters 4 and 21 are highly correlated. 
2. They consistently get assigned as decidual endothelial cells by both vento and surya datasets.
3. More than 75% of the cells in both of these clusters come from the decidua samples.
4. Markers genes of both clusters are enriched in GO categories related to epithelium development, endothelium development etc. 
5. But these two clusters are not as alike as clusters 0 and 18. They are different in terms of their expression of many endothelium related genes like CD34, PROX1, VWF, SPARCL. Cluster 21 looks more like "typical" endothelial cells. Cluster 4 has low expression of many typical endothelial genes. 
6. Cluster 4 has higher expression of some lymphatic endothelial cells: PROX1, LYVE1. Many genes from the list of surya marker genes for dec.LEC are expressed in cluster 4. This suggests that Cluster 4 may be lymphatic endothelial cells and cluster 21 may be endothelial cells. 

```{r}
annotation$annotation[annotation$cluster %in% c("clust_04")] <- "dec.Endo.L"
annotation$annotation[annotation$cluster %in% c("clust_21")] <- "dec.Endo"

annotation[annotation$cluster %in% paste0("clust_", c("04", "21")), ]
```

### Marker gene plots
```{r, fig.asp=1}
top50marker.plots[["clust_04"]]
```

```{r, fig.asp=1}
top50marker.plots[["clust_21"]]
```

### GO enrichment
```{r}
enrichGO2("clust_04")
```

```{r}
enrichGO2("clust_21")
```

### Vento markers
```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$Endo.m, title = "vento: Endo.m")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$Endo.f, title = "vento: Endo.f")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$Endo.L, title = "vento: Endo.L")
```

### Surya markers
```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.surya$gene[markers.surya$cluster == "vil.VEC"], title = "surya markers: vil.VEC")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.surya$gene[markers.surya$cluster == "dec.VEC"], title = "surya markers: dec.VEC")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.surya$gene[markers.surya$cluster == "dec.LEC"], title = "surya markers: dec.LEC")
```

## EVT
Cluster 13. 

### Notes

1. This cluster is unique in its expression of HLA-G, typical of extravillous trophoblast. 
2. Several other genes often enriched in EVT are also enriched in this cluster: FN1, PAPPA2, DIO2, etc.
3. top 30 markers associated with EVT_2 from vento dataset are also highly and specifically expressed in this cluster. 
4. It is consistently assigned to be EVT based on vento, surya, and pavli datasets. 

Annotation:  
**Cluster 13: Extravillous trophoblast (vil.EVT)**

```{r}
annotation$annotation[annotation$cluster %in% c("clust_13")] <- "vil.EVT"

annotation[annotation$cluster %in% c("clust_13"), ]
```

### Marker genes plot
```{r fig.asp=1}
top50marker.plots[["clust_13"]]
```

### GO enrichment
```{r message=FALSE}
enrichGO2("clust_13")
```

### Vento markers
```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$EVT_1, title = "vento markers: EVT_1")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$EVT_2, title = "vento markers: EVT_2")
```

### Surya markers
```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.surya$gene[markers.surya$cluster == "vil.EVT"], title = "surya markers: vil.EVT")
```

## VCT and SCT
clusters 16, 17, 23, 24, 25, and 29.

### Notes
For more information on expression patterns and markers of different trophoblast types, see [@liu_single-cell_2018].

1. In addition to cluster 13 (EVT), there are 6 other clusters that are likely trophoblast cell types (they express KRT7 and are related clusters). These 6 clusters are organized into two correlated blocks (see "correlations section above"): (16, 17, 25) and (23, 29, 24) of which the former are likely VCT and the latter SCT according to correlation-based annotation. 
2. Cluster 29 appears to be SCT given its expression of CGA, GDF15, ERVFRD-1 (Syncytin 2) genes. For some reason GO enrichment didn't work for markers of cluster 29, but it does consistently express SCT markers from vento and surya.
3. Cluster 24 doesn't express ERVFRD-1 much, but it does express most other telltale genes of SCT: CSH1, CSH2, CGA, CSHL1, many PSG genes, GH2, PGF, GDF15, etc. And it is not enriched for expression of VCT markers from vento and surya. It's enriched GO terms have to do with hormone production and signaling as expected for SCT. Thus, this is most likely an SCT cluster. 
4. Clusters 16 and 17 don't express many markers of SCT, but do express many vento and surya markers of VCT: PAGE4, PEG10, XAGE3, etc.
5. Cluster 25 has VCT markers as well as a strong signature of cell division genes. This shows up in the GO enrichment as well, where most enriched GO terms are "nuclear division", "chromosome segregation", "cell cycle G1/S phase transition" etc. This is likely a cluster of proliferating VCT cells. Incidentally both [@vento-tormo_single-cell_2018] and [@liu_single-cell_2018] have a separate population of VCT with a cell cycle gene signature (VCT_2 in vento). 
6. Cluster 23 is most similar to SCT of vento but VCT of surya. Indeed this cluster has expression of marker genes for both VCT and SCT (see plots). It expresses PAGE4, PEG10 etc, but also expresses syncytin gene ERVW-1. This could represent a group of cytotrophoblast cells that are in the process of differentiating to sycnytial trophoblast. We will label it as a VCT type. 

```{r}
annotation$annotation[annotation$cluster %in% c("clust_16")] <- "vil.VCT_1"
annotation$annotation[annotation$cluster %in% c("clust_17")] <- "vil.VCT_2"
annotation$annotation[annotation$cluster %in% c("clust_23")] <- "vil.VCT_3"
annotation$annotation[annotation$cluster %in% c("clust_25")] <- "vil.VCT_4"
annotation$annotation[annotation$cluster %in% c("clust_24")] <- "vil.SCT_1"
annotation$annotation[annotation$cluster %in% c("clust_29")] <- "vil.SCT_2"

annotation$notes[annotation$cluster %in% c("clust_25")] <- "proliferating"

annotation[annotation$cluster %in% paste0("clust_", c("16", "17", "23", "25", "24", "29")), ]
```

### Marker gene plots
```{r fig.asp=1}
top50marker.plots[["clust_16"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_17"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_25"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_23"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_24"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_29"]]
```

### GO enrichment
```{r message=FALSE}
enrichGO2("clust_16")
```

```{r message=FALSE}
enrichGO2("clust_17")
```

```{r message=FALSE}
enrichGO2("clust_25")
```

```{r message=FALSE}
enrichGO2("clust_23")
```

```{r message=FALSE}
enrichGO2("clust_24")
```

```{r message=FALSE}
enrichGO2("clust_29")
```

### Vento markers
```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$VCT_1, title = "vento markers: VCT_1")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$VCT_2, title = "vento markers: VCT_2")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$SCT, title = "vento markers: SCT")
```

### Surya markers
```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.surya$gene[markers.surya$cluster == "vil.VCT"], title = "surya markers: vil.VCT")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.surya$gene[markers.surya$cluster == "vil.SCT"], title = "surya markers: vil.SCT")
```

### Pavli markers
```{r fig.asp=0.4, warning=FALSE}
DotPlot2(features = c("ADRB1", "PUF60", "SNORDA3A", "PRMT7", "E1F1AY", 
                      "FXYD3", "GRAMD2", "INSL4", "ITGB8", "PAGE4",
                      "SLC13A4", "SLC22A11"), 
         title = "pavli markers: VCT_1")
```


```{r fig.asp=0.6, warning=FALSE}
DotPlot2(features = c("XAGE3", "XAGE2", "SERINC5", "S100P", "RASA1",
                      "MFAP5", "LIN28B", "KRT8", "KRT7", "INS-IGF2", "GCM1",
                      "GATA3", "ERVW-1", "ERVFRD-1", "EGR1", "EFNA1", 
                      "CYP19A1", "PHLDA2", "ACKR2"), 
         title = "pavli markers: VCT_2")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = c("SEMA3B", "CSH2", "CSHL1", "FCGR2A", "GH2", "HBA1", 
                      "HBA2", "HBB", "HBG1", "HBG2", "HLA-DMA", "HLA-DPA1",
                      "HLA-DQB1", "HLA-DRA", "HLA-DRB1", "HPGDS", "CGB8",
                      "LGALS14", "LYVE1", "NPIPB3", "CGB5", "PSG1", "PSG3", 
                      "PSG6", "PSG9", "ZNF117", "ZNF91", "HIST2H2AB",
                      "RAMP2", "MUC20"), 
         title = "pavli markers: SCT")
```

### Liu et al markers for trophoblasts
```{r fig.asp=0.5}
DotPlot2(seur, features = c("GCM1", "CSH1", "KRT7", "TFAP2C", "GATA3", "CDH1", "EGFR", "HLA-G", "MMP2", "RRM2", "CCNB1", "CDK1", "ERVFRD-1", "SLC1A5", "PAGE4", "CGB"), title = "Genes from Liu et al 2018")
```
## Fibroblasts and SMC
Clusters 02, 06, 07, 15, 19, and 33.

### Notes

1. Most of these clusters represent some kind of fibroblast or a closely related cell type. Most clusters are enriched for GO terms related to ECM regulation. 
2. Clusters 02 and 15 marker genes are also highly expressed in clusters 00 and 18. Cluster 15 has more than 75% cells coming from decidua samples, and cluster 02 has over 50% cells coming from decidua samples. Decidual stromal cells are known to be actually a few different populations. Even in Vento-tormo et al, they are labeled as dS1--3, of which dS3 are the canonical DSC with PRL and IGFBP1, while dS1 and dS2 are closely related to DSC and are derived from endometrial stromal fibroblasts. These things together suggest that clusters 02 and 15 are the non-PRL varieties of DSC. To distinguish them from DSC, we can call them decidual fibroblasts. 
3. Marker genes of 06, 07, and 33 show highly correlated gene expressin, i.e. the markers on each of these clusters are also highly expressed in the other two clusters, suggesting that these three clusters a closely related cell populations. All three of these clusters are largely (over 75% of the cells) derived from villi samples, and they are fibroblast-like cells. Thus, these are likely villous fibroblasts.
4. Cluster 19 is most similar to PV (perivascular) clusters from vento and SMC (smooth muscle cell) cluster from surya. Indeed, its markers include smooth muscle genes like MGP, MYH11, MYL9 etc, and enriched GO terms for this cluster include "muscle contraction" and "artery morphogenesis". Thus, this cluster is most likely of perivascular smooth muscle cells of decidual origin (over 75% cells from decidua samples). 

```{r}
annotation$annotation[annotation$cluster %in% c("clust_02")] <- "dec.FB_1"
annotation$annotation[annotation$cluster %in% c("clust_15")] <- "dec.FB_2"
annotation$annotation[annotation$cluster %in% c("clust_06")] <- "vil.FB_1"
annotation$annotation[annotation$cluster %in% c("clust_07")] <- "vil.FB_2"
annotation$annotation[annotation$cluster %in% c("clust_33")] <- "vil.FB_3"
annotation$annotation[annotation$cluster %in% c("clust_19")] <- "dec.SMC"

annotation[annotation$cluster %in% paste0("clust_", c("02", "15", "06", "07", "33", "19")), ]
```

### Marker gene plots
```{r fig.asp=1}
top50marker.plots[["clust_02"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_15"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_06"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_07"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_19"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_33"]]
```

### GO enrichment
```{r message=FALSE}
enrichGO2("clust_02")
```

```{r message=FALSE}
enrichGO2("clust_15")
```

```{r message=FALSE}
enrichGO2("clust_06")
```

```{r message=FALSE}
enrichGO2("clust_07")
```

```{r message=FALSE}
enrichGO2("clust_19")
```

```{r message=FALSE}
enrichGO2("clust_33")
```

### Vento markers

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$PV1, title = "vento markers: PV1")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$PV2, title = "vento markers: PV2")
```

### Surya markers
```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.surya$gene[markers.surya$cluster == "vil.FB1"], title = "surya markers: vil.FB1")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.surya$gene[markers.surya$cluster == "dec.SMC"], title = "surya markers: dec.SMC")
```

## B cells
Cluster 20.

### Notes
This is most likely a cluster of B cells. Most markers genes are related to B cells: many immunoglobulin gnees, SPIB, MS4A1, CD79A, CD79B, BANK1, etc. It also expresses many genes that are markers for Plasma cluster from vento dataset. GO enrichment also agrees.

```{r}
annotation$annotation[annotation$cluster %in% c("clust_20")] <- "Bcell"

annotation[annotation$cluster %in% paste0("clust_", c("20")), ]
```

### Marker genes plot
```{r fig.asp=1}
top50marker.plots[["clust_20"]]
```

### GO enrichment
```{r message=FALSE}
enrichGO2("clust_20")
```

### Vento markers
```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$Plasma, title = "vento markers: Plasma")
```

## Other lymphocytes
Clusters  05, 09, 10, 14, 30, and 32.

### Notes
For clues on marker genes for various types of lymphocytes see [@pizzolato_single-cell_2019].

1. These are likely non-B lymphoid cells. 
2. Clusters 09 and 32 are most similar to Tcells from both vento and surya. Given the expression of CD3 genes, that appears to be the correct assignment. 
     + The clusters also express TRAC and TRBC1 (constant regions of TCR alpha and beta), making them alpha-beta T cells. 
     + It's hard to tell if they are CD4 or CD8---both CD4 and CD8 genes are expressed by a small fraction of cells in both clusters, but CD8 genes expressed at a higher level. 
     + GO enrichment for both include terms related to T cell differentiation, T cell activation etc. 
     + These clusters contain cells from both decidua and villi samples. 
3. Cluster 05 is most similar to NK cells from vento and surya. But based on the lack of NKG7 and NCAM1 expression and expression of CD3 genes, TRAC, and TRBC1 this is likely a T cell cluster. One of its marker genes being GZMA and its overall similarity with NK cells suggest that this may be a cytotoxic T cell. Consistently, among its enriched GO terms are "T cell mediated cytotoxicity" and "Cell killing". This cluster mostly contains cells from decidua samples.
4. Clusters 10, 14, and 30 are NK cells based on the expression of NKG7 and NCAM1. 
     + These clusters are also express GZMA, GZMB, GNLY, PRF1, KLRB1, KLRC1, KLRD1. 
     + Cluster 30 is enriched for cell cycle genes, so represents proliferating cells. Vento and surya datasets also have a cluster of proliferating NK cells (dNK.p in vento and NK2 in surya). +
     + Cluster 30 in unambiguously of decidual sample origin. Clusters 10 and 14 are mixed. 
     
```{r}
annotation$annotation[annotation$cluster %in% c("clust_05")] <- "Tcell_1"
annotation$annotation[annotation$cluster %in% c("clust_09")] <- "Tcell_2"
annotation$annotation[annotation$cluster %in% c("clust_10")] <- "NK_1"
annotation$annotation[annotation$cluster %in% c("clust_14")] <- "NK_2"
annotation$annotation[annotation$cluster %in% c("clust_30")] <- "NK_3"
annotation$annotation[annotation$cluster %in% c("clust_32")] <- "Tcell_3"

annotation$notes[annotation$cluster %in% c("clust_05")] <- "cytotoxic"
annotation$notes[annotation$cluster %in% c("clust_30")] <- "proliferating"


annotation[annotation$cluster %in% paste0("clust_", c("05", "09", "10", "14", "30", "32")), ]
```

### Lymphocyte candidate genes
```{r fig.asp=0.5}
DotPlot2(features = c("NCAM1", "NKG7", "CD3D", "CD3G", "CD3E", "TRAC", "TRBC1", "TRGC2", "TRDC", "CD8A", "CD8B", "CD4"), title = "Lymphocyte candidate genes")
```

### Marker gene plots
```{r fig.asp=1}
top50marker.plots[["clust_05"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_09"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_10"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_14"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_30"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_32"]]
```

### GO enrichment
```{r message=FALSE}
enrichGO2("clust_05")
```

```{r message=FALSE}
enrichGO2("clust_09")
```

```{r message=FALSE}
enrichGO2("clust_10")
```

```{r message=FALSE}
enrichGO2("clust_14")
```

```{r message=FALSE}
enrichGO2("clust_30")
```

```{r message=FALSE}
enrichGO2("clust_32")
```

### Vento markers

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$Tcells_2, title = "vento markers: Tcells_2")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$Tcells_4, title = "vento markers: Tcells_4")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$dNK1, title = "vento markers: dNK1")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$dNKp, title = "vento markers: dNKp")
```


### Surya markers
```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.surya$gene[markers.surya$cluster == "dec.NK1"], title = "surya markers: dec.NK1")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.surya$gene[markers.surya$cluster == "dec.NK2"], title = "surya markers: dec.NK2")
```

## Myeloid cells
Clusters 01, 03, 08, 11, 12, 22, 26, 27, 28, 31, and 34.

### Notes

1. Clusters 01, 22, 31, 34, and 27 form a group in the hierarchical clustering above, and they all have most cells (more than 75%) coming from villi samples. 
    + All five clusters are highly highly correlated with each other, especially clusters 01, 31, and 34. 
    + All clusters express various macrophage genes: CD68, CD163, MRC1, MAF, C1QC, C1QA, FOLR2 etc; and more so clusters 01, 31, and 34. 
    + Clusters 01 and 31 are most likely villous macrophages (i.e. Hofbauer cells), but the other cluster need more investigation. 
    + Clusters 22 and 27, even though correlated with 01 and 34, have marker genes that are not that highly expressed in themselves but are in 01 and 34. Looking at the UMAP plot below, it's clear that both of these clusters are not as homogenous as the other. The cells are spread between the big myeloid blob and the big fibroblast blob. 
    + However, these don't appear to be clusters of different types of cells. That is, fibroblast genes (COL3A1, COL3A3) and macrophage genes are coexpressed in the same cells in cluster 27 (see plot below). And macrophage genes are broadly expressed in cluster 22, even in cells that seem to be embedded with fibroblast cells. Clusters 22 and 27 might represent cell types that are macrophages but with certain fibroblast-like properties. 
    + Looking at cluster 34 markers genes, it interestingly expresses many erythrocyte lineage genes (HBA1, HBA2, HBM, CD1, HBG1, CD235a/GYPA) but it also expresses macrophage genes like CSF1R, FOLR2, MRC1, CD163 etc. If we plot the expression of these genes on UMAP (plotting only clust_34, see below), we see that majority of the cells express macrophage genes, while the erythrocyte genes are expressed in a small number of cells (except HBG2), suggesting that this cluster is made of two cell types. Since majority of the cells in this cluster are macrophage cells, we will label it as such, but should the need arise in further analysis, we can subcluster it further to separate the two cell types out.
2. Cluster 26 is most likely of erythrocytes (or some cell type from the erythrocyte lineage). 
    + The surya dataset does contain what they call an erythroblast cluster, but in erythropoeisis is expected in the placenta in the first trimester. In our term samples, I am not sure if erythrocyte progenitors are expected to be present. 
    + Marker genes, similarity with surya EB cluster, and GO enrichement (which include "erythrocyte development", "oxygen transport", etc) are all consistent with each other. 
    + Most of the cells in this cluster come from villi samples, so fetal origin.
3. Cluster 28 most likely is made of typical granulocytes. 
    + It expresses the granulocyte marker gene CEACAM8, in addition to CEACAM6, ELANE, and MPO. 
    + GO enrichment is consistent.
    + All marker genes are consistent: LYZ, DEFA3, DEFA4, S100A8, NCF1, etc. 
4. Cluster 11 and 12 are most likely APC.
    + Of maternal origin. Cluster 12 mostly comes from decidua sample, cluster 11 has mixed contribution, in terms of its gene expression it's very similar to cluster 12. 
    + Very similar to each other. Among vento cluster, most similarity to macrophage clusters. Among surya clusters, most similarity to maternal DC and maternal APC clusters.
    + Expression of classical macrophage/DC markers like CD68, CD163. In addition most of the marker genes of this cluster have to do with antigen presentation. Many of them are MHC-II genes. 
    + Not sure if they are macrophages or DCs, but to avoid committing to one conclusion over the other, we can just label them as APCs. 
5. Clusters 3 and 8 are difficult.
    + Low, if any, expression of CD68 and CD163, thus not quite macrophages. But at least cluster 8 is quite similar to macrophages in overall gene expression (see hierarchical clustering above).
    + Low, if any, expression of HLA genes, thus not quite APCs.
    + No expression of typical granulocyte markers: CEACAM8, CEACAM6, MPO, ELANE. But many markers of these clusters sound "granulocytic". This is evident in GO enrichment as well. In addition, cluster 3 is most similar to cluster 28 (see hierarchical clustering above) which is clearly a granulocyte cluster.
    + Cluster 08 is most similar to the MO_1 and MO_2 clusters from vento. Cluster 3 had low similarity with any of the clusters from vento and surya. 
    + Both clusters express MNDA, which is typically only expressed in cells in the monocyte-granulocyte lineage. 
    + Both clusters express LYZ, S100A9, S100A8, which I think are also monocyte genes.
    + These might be monocytes but they don't express CD14. 
    + It's in fact possible to have CD14--ve monocyte, it turns out [@mukherjee_non-classical_2015]. See introduction of [@sampath_monocyte-subsets_2018], which summarizes observations from [@villani_single-cell_2017]. These papers point to non-CD14 monocyte populations. In fact most genes in the Mono_3 cluster of [@villani_single-cell_2017] are expressed in cluster 03, e.g. MXD1, VNN2, CXCR2.
    + Overall it looks like clusters 3 and 8 are two types of monocytes. (at the very least they are from the monocyte-granulocyte lineage.) 
    
```{r}
annotation$annotation[annotation$cluster %in% c("clust_01")] <- "vil.Hofb_1"
annotation$annotation[annotation$cluster %in% c("clust_22")] <- "vil.Hofb_2"
annotation$annotation[annotation$cluster %in% c("clust_27")] <- "vil.Hofb_3"
annotation$annotation[annotation$cluster %in% c("clust_31")] <- "vil.Hofb_4"
annotation$annotation[annotation$cluster %in% c("clust_34")] <- "vil.Hofb_5"

annotation$annotation[annotation$cluster %in% c("clust_03")] <- "Mono_1"
annotation$annotation[annotation$cluster %in% c("clust_08")] <- "Mono_2"
annotation$annotation[annotation$cluster %in% c("clust_11")] <- "APC_1"
annotation$annotation[annotation$cluster %in% c("clust_12")] <- "APC_2"
annotation$annotation[annotation$cluster %in% c("clust_26")] <- "vil.Ery"
annotation$annotation[annotation$cluster %in% c("clust_28")] <- "Gran"

annotation$notes[annotation$cluster %in% c("clust_27")] <- "also fibroblast signature"
annotation$notes[annotation$cluster %in% c("clust_34")] <- "likely also contains erythro"
annotation$notes[annotation$cluster %in% c("clust_03")] <- "unsure"
annotation$notes[annotation$cluster %in% c("clust_08")] <- "unsure"

annotation[annotation$cluster %in% paste0("clust_", 
                                          c("01", "03", "08", "11", "12",
                                            "22", "26", "27", "28", "31", "34")), ]
```

### Myeloid candidate genes
```{r fig.asp=0.5}
DotPlot2(features = c("PTPRC", # pan
                      "CD33", "CD14", "CD68", "CD163", "CLEC10A", "HLA-DRA", # apc
                      "CEACAM8", "CEACAM6", "MPO", "ELANE", # granulocytes
                      "FCGR3A", # inflammatory monocytes?
                      "GYPA" # erythro
                      ), 
         title = "Myeloid candidate genes")
```

### Hofb clusters on UMAP
```{r fig.width=6}
DimPlot(seur, 
        cols = c(rep("Grey90", 30), "#aec7e8", "#ff9e4a", "#2ca02c", "#ed665d", "#9467bd"), 
        order = (rev(c("clust_01", "clust_22", "clust_27", "clust_31", "clust_34")))) + 
  theme_few() +
  coord_fixed()
```
### clust_22 markers on UMAP
```{r fig.asp=0.5}
DefaultAssay(seur) <- "SCT"
p <- FeaturePlot(seur, features = c("CD163", "CSF1R", "C1QA", "MRC1", "CD14", "FOLR2"), 
                 cells = rownames(seur@meta.data)[seur@meta.data$seurat_clusters == "clust_22"],
                 coord.fixed = TRUE, ncol = 3, combine = FALSE) 

p[[1]] + p[[2]] + p[[3]] + p[[4]] + p[[5]] + p[[6]] + 
  plot_layout(ncol = 3) +
  plot_annotation(title = "clust_22") &
  theme_bw() +
  theme(panel.grid.major = element_line(size = 0.25),
        panel.grid.minor = element_blank(),
        legend.key.size = unit(0.75, "line"),
        axis.title = element_text(size = 6),
        plot.title = element_text(size = 10)) 
```

### clust_27 markers on UMAP
```{r}
DefaultAssay(seur) <- "SCT"
p <- FeaturePlot(seur, features = c("COL3A1", "CD14", "MRC1", "LYVE1", "COL1A1"), cells = rownames(seur@meta.data)[seur@meta.data$seurat_clusters == "clust_27"],
                 coord.fixed = TRUE, ncol = 3) 
p[[1]] + p[[2]] + p[[3]] + p[[4]] + p[[5]] + 
  plot_layout(ncol = 3) +
  plot_annotation(title = "clust_27") &
  theme_bw() +
  theme(panel.grid.major = element_line(size = 0.25),
        panel.grid.minor = element_blank(),
        legend.key.size = unit(0.75, "line"),
        axis.title = element_text(size = 6),
        plot.title = element_text(size = 10)) 
```

### clust_34 markers on UMAP
```{r}
DefaultAssay(seur) <- "SCT"
p <-  FeaturePlot(seur, 
                  features = c("GYPA", "HBM", "HBG2", "HBG1", "ALAS2", "AHSP", 
                               "MRC1", "CD163", "CSF1R"), 
                  cells = rownames(seur@meta.data)[seur@meta.data$seurat_clusters == "clust_34"],
                  coord.fixed = FALSE, ncol = 3)
p[[1]] + p[[2]] + p[[3]] + p[[4]] + p[[5]] + p [[6]] + p[[7]] + p[[8]] + p[[9]] + 
plot_layout(ncol = 3) +
  plot_annotation(title = "clust_34") &
  theme_bw() +
  theme(panel.grid.major = element_line(size = 0.25),
        panel.grid.minor = element_blank(),
        legend.key.size = unit(0.75, "line"),
        axis.title = element_text(size = 6),
        plot.title = element_text(size = 10)) 
```

### Marker gene plots
```{r fig.asp=1}
top50marker.plots[["clust_01"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_03"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_08"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_11"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_12"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_22"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_26"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_27"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_28"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_31"]]
```

```{r fig.asp=1}
top50marker.plots[["clust_34"]]
```

### GO enrichment
```{r message=FALSE}
enrichGO2("clust_01")
```

```{r message=FALSE}
enrichGO2("clust_03")
```

```{r message=FALSE}
enrichGO2("clust_08")
```

```{r message=FALSE}
enrichGO2("clust_11")
```

```{r message=FALSE}
enrichGO2("clust_12")
```

```{r message=FALSE}
enrichGO2("clust_22")
```

```{r message=FALSE}
enrichGO2("clust_26")
```

```{r message=FALSE}
enrichGO2("clust_27")
```

```{r message=FALSE}
enrichGO2("clust_28")
```

```{r message=FALSE}
enrichGO2("clust_31")
```

```{r message=FALSE}
enrichGO2("clust_34")
```

### Vento markers
```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$dM1, title = "vento markers: dM1")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$dM2, title = "vento markers: dM2")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$HB, title = "vento markers: HB")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$M3, title = "vento markers: M3")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$MO_1, title = "vento markers: MO_1")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$MO_2, title = "vento markers: MO_2")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$DC1, title = "vento markers: DC1")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.vento$DC2, title = "vento markers: DC2")
```

### Surya markers
```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.surya$gene[markers.surya$cluster == "vil.HC"], title = "surya markers: vil.HC")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.surya$gene[markers.surya$cluster == "vil.EB"], title = "surya markers: vil.EB")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.surya$gene[markers.surya$cluster == "dec.APC"], title = "surya markers: dec.APC")
```

```{r fig.asp=0.8, warning=FALSE}
DotPlot2(features = markers.surya$gene[markers.surya$cluster == "vil.HC"], title = "surya markers: vil.HC")
```

# Final annotations
## Merge similar clusters
In addition to the annotations so far, we can have a second set of coarser-grained annotations so that we have fewer clusters to deal with. For this we will merge some similar clusters---mostly non-immune clusters, leaving all immune clusters intact. 

```{r}
annotation$annotation_merged <- annotation$annotation

annotation$annotation_merged[annotation$annotation %in% paste0("vil.Hofb_", c(1:5))] <- "vil.Hofb"
annotation$annotation_merged[annotation$annotation %in% paste0("dec.DSC_", c(1:2))] <- "dec.DSC"
annotation$annotation_merged[annotation$annotation %in% paste0("dec.FB_", c(1:3))] <- "dec.FB"
annotation$annotation_merged[annotation$annotation %in% paste0("vil.FB_", c(1:3))] <- "vil.FB"
annotation$annotation_merged[annotation$annotation %in% paste0("APC_", c(1:2))] <- "APC"
annotation$annotation_merged[annotation$annotation %in% paste0("vil.VCT_", c(1:4))] <- "vil.VCT"
annotation$annotation_merged[annotation$annotation %in% paste0("vil.SCT_", c(1:2))] <- "vil.SCT"
annotation$annotation_merged[annotation$annotation %in% c("dec.Endo", "dec.Endo.L")] <- "dec.Endo"
```

```{r}
# rearrange columns and write annotations to csv
annotation <- annotation[, c("cluster", "annotation", "annotation_merged", "notes")]

write.csv(annotation, "../results/02_annotation/files/annotations.csv", row.names = FALSE)
```


```{r}
annotation %>% kable(caption = "Final annotations of all clusters") %>% kable_styling(full_width = FALSE)
```
## Add annotations to `Seurat` object
```{r}
# add annotations
seur@meta.data$annotation <- annotation$annotation[match(
  x = seur@meta.data$seurat_clusters,
  table = annotation$cluster)
  ]

seur@meta.data$annotation_merged <- annotation$annotation_merged[match(
  x = seur@meta.data$seurat_clusters, 
  table = annotation$cluster)
  ]

```


```{r}
# write seurat object
saveRDS(seur, file = "../data/seurat-object_annotated.rds")
```

## Plots
### UMAP: all annotations
```{r, fig.asp=1, fig.width=7}
Idents(seur) <- seur@meta.data$annotation
DimPlot(seur, label = TRUE, repel = TRUE, label.size = 4, shuffle = TRUE) +
  coord_fixed() +
  theme_minimal() +
  theme(legend.position = "none",
        panel.grid = element_blank(),
        axis.title = element_text(size = 8))

ggsave(last_plot(), filename = "../results/02_annotation/plots/umap_annotated.pdf", 
       device = "pdf", width = 7, height = 7, units = "in")
```
### UMAP: merged annotations
```{r, fig.asp=1, fig.width=7}
Idents(seur) <- seur@meta.data$annotation_merged
DimPlot(seur, label = TRUE, repel = TRUE, label.size = 4, shuffle = TRUE) +
  coord_fixed() +
  theme_minimal() +
  theme(legend.position = "none",
        panel.grid = element_blank(),
        axis.title = element_text(size = 8))

ggsave(last_plot(), filename = "../results/02_annotation/plots/umap_annotated_merged.pdf", 
       device = "pdf", width = 7, height = 7, units = "in")
```

# Session Info
```{r}
sessionInfo()
```

# References

